diff --git a/BUILD.md b/BUILD.md index c4d8b1b356..c94bb24204 100644 --- a/BUILD.md +++ b/BUILD.md @@ -5,9 +5,12 @@ - [Build Dependencies](#required_depenencies) - [Header-only C++](#install_header_only_cpp) - [C++ Shared Libraries](#shared_cpp_libs) + - [Improving Rebuild Times](#ccache) - [Googletests](#gtests) + - [Googlebench](#gbench) - [C++ Using Cmake](#cpp_using_cmake) - [Python](#python) + - [Documentation](#docs) - [Using RAFT in downstream projects](#use_raft) - [Cmake Header-only Integration](#cxx_integration) - [Using Shared Libraries in Cmake](#use_shared_libs) @@ -27,15 +30,14 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth #### Required - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. - + #### Optional -- [mdspan](https://github.com/rapidsai/mdspan) - On by default but can be disabled. - [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - On by default but can be disabled. - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests. -- [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` -- [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `Pyraft` +- [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `raft-dask` +- [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `raft-dask` - [Googletest](https://github.com/google/googletest) - Needed to build tests - [Googlebench](https://github.com/google/benchmark) - Needed to build benchmarks - [Doxygen](https://github.com/doxygen/doxygen) - Needed to build docs @@ -53,11 +55,6 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` -The `--minimal-deps` flag can be used to install the headers with minimal dependencies: -```bash -./build.sh libraft --install --minimal-deps -``` - ### C++ Shared Libraries (optional) For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: @@ -72,6 +69,14 @@ Individual shared libraries have their own flags and multiple can be used (thoug Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`. +### `ccache` and `sccache` + +`ccache` and `sccache` can be used to better cache parts of the build when rebuilding frequently, such as when working on a new feature. You can also use `ccache` or `sccache` with `build.sh`: + +```bash +./build.sh libraft --cache-tool=ccache +``` + ### Tests Compile the tests using the `tests` target in `build.sh`. @@ -86,23 +91,30 @@ Test compile times can be improved significantly by using the optional shared li ./build.sh libraft tests --compile-libs ``` -To run C++ tests: +The tests are broken apart by algorithm category, so you will find several binaries in `cpp/build/` named `*_TEST`. 
+ +For example, to run the distance tests: +```bash +./cpp/build/DISTANCE_TEST +``` + +It can take some time to compile all of the tests. You can build individual tests by providing a semicolon-separated list to the `--limit-tests` option in `build.sh`: ```bash -./cpp/build/test_raft +./build.sh libraft tests --limit-tests=SPATIAL_TEST;DISTANCE_TEST;MATRIX_TEST ``` -### Benchmarks +### Benchmarks -Compile the benchmarks using the `bench` target in `build.sh`: +The benchmarks are broken apart by algorithm category, so you will find several binaries in `cpp/build/` named `*_BENCH`. ```bash ./build.sh libraft bench ``` -To run the benchmarks: +It can take some time to compile all of the benchmarks. You can build individual benchmarks by providing a semicolon-separated list to the `--limit-bench` option in `build.sh`: ```bash -./cpp/build/bench_raft +./build.sh libraft bench --limit-bench=SPATIAL_BENCH;DISTANCE_BENCH;LINALG_BENCH ``` ### C++ Using Cmake @@ -128,10 +140,7 @@ RAFT's cmake has the following configurable flags available:. | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` | | RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. This can be disabled when using many simple utilities or to override with a different Thrust version. | -| RAFT_ENABLE_mdspan_DEPENDENCY | ON, OFF | ON | Enables the std::mdspan dependency. This can be disabled when using many simple utilities. | -| RAFT_ENABLE_nccl_DEPENDENCY | ON, OFF | OFF | Enables NCCL dependency used by `raft::comms` and needed to build `pyraft` | -| RAFT_ENABLE_ucx_DEPENDENCY | ON, OFF | OFF | Enables UCX dependency used by `raft::comms` and needed to build `pyraft` | -| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | +| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | | RAFT_STATIC_LINK_LIBRARIES | ON, OFF | ON | Build static link libraries instead of shared libraries | | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | | NVTX | ON, OFF | OFF | Enable NVTX Markers | @@ -143,22 +152,26 @@ Currently, shared libraries are provided for the `libraft-nn` and `libraft-dista ### Python -Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. The following example will install create and install dependencies for a CUDA 11.5 conda environment: +Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition, you will have to manually install `nvcc`, as it will not be installed as part of the conda environment.
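+For instance, one way to get `nvcc` into the environment is through conda itself. This is only an illustrative sketch: it assumes the `nvidia` channel provides a `cuda-nvcc` package matching your CUDA toolkit version, and any approach that puts a matching `nvcc` on the `PATH` works just as well.
+
+```bash
+# hypothetical example: install a cuda-nvcc build that matches your cudatoolkit
+mamba install -c nvidia cuda-nvcc
+```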
+The following example will create and install the dependencies for a CUDA 11.5 conda environment: ```bash mamba env create --name raft_env_name -f conda/environments/raft_dev_cuda11.5.yml mamba activate raft_env_name ``` -The Python APIs can be built using the `build.sh` script: +The Python APIs can be built and installed using the `build.sh` script: ```bash +# to build pylibraft +./build.sh libraft pylibraft --install --compile-libs +# to build raft-dask +./build.sh libraft raft-dask --install --compile-libs ``` `setup.py` can also be used to build the Python APIs manually: -```bash -cd python/raft + +```bash +cd python/raft-dask python setup.py build_ext --inplace python setup.py install @@ -169,16 +182,28 @@ python setup.py install To run the Python tests: ```bash -cd python/raft -py.test -s -v raft +cd python/raft-dask +py.test -s -v + +cd python/pylibraft +py.test -s -v +``` + +### Documentation + +The documentation requires that the C++ headers and Python packages have been built and installed. + +The following will build the docs along with the C++ and Python packages: -cd python pylibraft -py.test -s -v pylibraft ``` +./build.sh libraft pylibraft raft-dask docs --compile-libs --install +``` + + ## Using RAFT in downstream projects -There are two different strategies for including RAFT in downstream projects, depending on whether or not the required dependencies are already installed and available on the `lib` and `include` paths. +There are two different strategies for including RAFT in downstream projects, depending on whether or not the required dependencies are already installed and available on the `lib` and `include` paths. ### C++ header-only integration using cmake @@ -187,7 +212,7 @@ When the needed [build dependencies](#required_depenencies) are already satisfie set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") ExternalProject_Add(raft GIT_REPOSITORY git@github.com:rapidsai/raft.git - GIT_TAG branch-22.04 + GIT_TAG branch-22.10 PREFIX ${RAFT_GIT_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -205,13 +230,13 @@ The pre-compiled libraries contain template specializations for commonly used ty The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead: ```c++ -#include -#include +#include +#include ``` ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). +RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to invoking `find_package(raft)` but uses `rapids_cpm_find`, which provides a richer and more flexible configuration landscape by using CPM to fetch any dependencies not already available to the build. The `raft::raft` link target will be made available and it's recommended that it be used as a `PRIVATE` link dependency in downstream projects. The `COMPILE_LIBRARIES` option enables building the shared libraries.
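+Once configured this way, consuming RAFT is just a matter of linking the target. A minimal sketch of a downstream `CMakeLists.txt` fragment (the `my_lib` target and its source file are placeholders, not part of RAFT):
+
+```cmake
+# hypothetical downstream target; linking raft::raft privately brings in
+# RAFT's include paths and transitive dependencies
+add_library(my_lib SHARED src/my_lib.cu)
+target_link_libraries(my_lib PRIVATE raft::raft)
+```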
@@ -219,15 +244,15 @@ The following `cmake` snippet enables a flexible configuration of RAFT: ```cmake -set(RAFT_VERSION "22.04") +set(RAFT_VERSION "22.10") set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES CLONE_ON_PIN - USE_NN_LIBRARY USE_DISTANCE_LIBRARY - ENABLE_thrust_DEPENDENCY ENABLE_mdspan_DEPENDENCY) + USE_NN_LIBRARY USE_DISTANCE_LIBRARY + ENABLE_thrust_DEPENDENCY) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -272,7 +297,6 @@ function(find_and_configure_raft) "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" "RAFT_ENABLE_thrust_DEPENDENCY ${PKG_ENABLE_thrust_DEPENDENCY}" - "RAFT_ENABLE_mdspan_DEPENDENCY ${PKG_ENABLE_mdspan_DEPENDENCY}" ) endfunction() @@ -295,7 +319,6 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed USE_FAISS_STATIC NO ENABLE_thrust_DEPENDENCY YES - ENABLE_mdspan_DEPENDENCY YES ) ``` diff --git a/CHANGELOG.md b/CHANGELOG.md index b341367022..ac1e8581df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,79 @@ +# raft 22.10.00 (12 Oct 2022) + +## 🚨 Breaking Changes + +- Separating mdspan/mdarray infra into host_* and device_* variants ([#810](https://github.com/rapidsai/raft/pull/810)) [@cjnolet](https://github.com/cjnolet) +- Remove type punning from TxN_t ([#781](https://github.com/rapidsai/raft/pull/781)) [@wphicks](https://github.com/wphicks) +- ivf_flat::index: hide implementation details ([#747](https://github.com/rapidsai/raft/pull/747)) [@achirkin](https://github.com/achirkin) + +## 🐛 Bug Fixes + +- ivf-pq integration: hotfixes ([#891](https://github.com/rapidsai/raft/pull/891)) [@achirkin](https://github.com/achirkin) +- Removing cub symbol from libraft-distance instantiation. 
([#887](https://github.com/rapidsai/raft/pull/887)) [@cjnolet](https://github.com/cjnolet) +- ivf-pq post integration hotfixes ([#878](https://github.com/rapidsai/raft/pull/878)) [@achirkin](https://github.com/achirkin) +- Fixing a few compile errors in new APIs ([#874](https://github.com/rapidsai/raft/pull/874)) [@cjnolet](https://github.com/cjnolet) +- Include knn.cuh in knn.cu benchmark source for finding brute_force_knn ([#855](https://github.com/rapidsai/raft/pull/855)) [@teju85](https://github.com/teju85) +- Do not use strcpy to copy 2 char ([#848](https://github.com/rapidsai/raft/pull/848)) [@mhoemmen](https://github.com/mhoemmen) +- rng_state not including necessary cstdint ([#839](https://github.com/rapidsai/raft/pull/839)) [@MatthiasKohl](https://github.com/MatthiasKohl) +- Fix integer overflow in ANN kmeans ([#835](https://github.com/rapidsai/raft/pull/835)) [@Nyrio](https://github.com/Nyrio) +- Add alignment to the TxN_t vectorized type ([#792](https://github.com/rapidsai/raft/pull/792)) [@achirkin](https://github.com/achirkin) +- Fix adj_to_csr_kernel ([#785](https://github.com/rapidsai/raft/pull/785)) [@ahendriksen](https://github.com/ahendriksen) +- Use rapids-cmake 22.10 best practice for RAPIDS.cmake location ([#784](https://github.com/rapidsai/raft/pull/784)) [@robertmaynard](https://github.com/robertmaynard) +- Remove type punning from TxN_t ([#781](https://github.com/rapidsai/raft/pull/781)) [@wphicks](https://github.com/wphicks) +- Various fixes for build.sh ([#771](https://github.com/rapidsai/raft/pull/771)) [@vyasr](https://github.com/vyasr) + +## 📖 Documentation + +- Fix target names in build.sh help text ([#879](https://github.com/rapidsai/raft/pull/879)) [@Nyrio](https://github.com/Nyrio) +- Document that minimum required CMake version is now 3.23.1 ([#841](https://github.com/rapidsai/raft/pull/841)) [@robertmaynard](https://github.com/robertmaynard) + +## 🚀 New Features + +- mdspanify raft::random functions uniformInt, normalTable, fill, bernoulli, and scaled_bernoulli ([#897](https://github.com/rapidsai/raft/pull/897)) [@mhoemmen](https://github.com/mhoemmen) +- mdspan-ify several raft::random rng functions ([#857](https://github.com/rapidsai/raft/pull/857)) [@mhoemmen](https://github.com/mhoemmen) +- Develop new mdspan-ified multi_variable_gaussian interface ([#845](https://github.com/rapidsai/raft/pull/845)) [@mhoemmen](https://github.com/mhoemmen) +- Mdspanify permute ([#834](https://github.com/rapidsai/raft/pull/834)) [@mhoemmen](https://github.com/mhoemmen) +- mdspan-ify rmat_rectangular_gen ([#833](https://github.com/rapidsai/raft/pull/833)) [@mhoemmen](https://github.com/mhoemmen) +- mdspanify sampleWithoutReplacement ([#830](https://github.com/rapidsai/raft/pull/830)) [@mhoemmen](https://github.com/mhoemmen) +- mdspan-ify make_regression ([#811](https://github.com/rapidsai/raft/pull/811)) [@mhoemmen](https://github.com/mhoemmen) +- Updating `raft::linalg` APIs to use `mdspan` ([#809](https://github.com/rapidsai/raft/pull/809)) [@divyegala](https://github.com/divyegala) +- Integrate KNN implementation: ivf-pq ([#789](https://github.com/rapidsai/raft/pull/789)) [@achirkin](https://github.com/achirkin) + +## 🛠️ Improvements + +- Some fixes for build.sh ([#901](https://github.com/rapidsai/raft/pull/901)) [@cjnolet](https://github.com/cjnolet) +- Revert recent fused l2 nn instantiations ([#899](https://github.com/rapidsai/raft/pull/899)) [@cjnolet](https://github.com/cjnolet) +- Update Python build instructions 
([#898](https://github.com/rapidsai/raft/pull/898)) [@betatim](https://github.com/betatim) +- Adding ninja and cxx compilers to conda dev dependencies ([#893](https://github.com/rapidsai/raft/pull/893)) [@cjnolet](https://github.com/cjnolet) +- Output non-normalized distances in IVF-PQ and brute-force KNN ([#892](https://github.com/rapidsai/raft/pull/892)) [@Nyrio](https://github.com/Nyrio) +- Readme updates for 22.10 ([#884](https://github.com/rapidsai/raft/pull/884)) [@cjnolet](https://github.com/cjnolet) +- Breaking apart benchmarks into individual binaries ([#883](https://github.com/rapidsai/raft/pull/883)) [@cjnolet](https://github.com/cjnolet) +- Pin `dask` and `distributed` for release ([#858](https://github.com/rapidsai/raft/pull/858)) [@galipremsagar](https://github.com/galipremsagar) +- Mdspanifying (currently tested) `raft::matrix` ([#846](https://github.com/rapidsai/raft/pull/846)) [@cjnolet](https://github.com/cjnolet) +- Separating _RAFT_HOST and _RAFT_DEVICE macros ([#836](https://github.com/rapidsai/raft/pull/836)) [@cjnolet](https://github.com/cjnolet) +- Updating cpu job in hopes it speeds up python cpu builds ([#828](https://github.com/rapidsai/raft/pull/828)) [@cjnolet](https://github.com/cjnolet) +- Mdspan-ifying `raft::spatial` ([#827](https://github.com/rapidsai/raft/pull/827)) [@cjnolet](https://github.com/cjnolet) +- Fixing __init__.py for handle and stream ([#826](https://github.com/rapidsai/raft/pull/826)) [@cjnolet](https://github.com/cjnolet) +- Moving a few more things around ([#822](https://github.com/rapidsai/raft/pull/822)) [@cjnolet](https://github.com/cjnolet) +- Use fusedL2NN in ANN kmeans ([#821](https://github.com/rapidsai/raft/pull/821)) [@Nyrio](https://github.com/Nyrio) +- Separating test executables ([#820](https://github.com/rapidsai/raft/pull/820)) [@cjnolet](https://github.com/cjnolet) +- Separating mdspan/mdarray infra into host_* and device_* variants ([#810](https://github.com/rapidsai/raft/pull/810)) [@cjnolet](https://github.com/cjnolet) +- Fix malloc/delete mismatch ([#808](https://github.com/rapidsai/raft/pull/808)) [@mhoemmen](https://github.com/mhoemmen) +- Renaming `pyraft` -> `raft-dask` ([#801](https://github.com/rapidsai/raft/pull/801)) [@cjnolet](https://github.com/cjnolet) +- Branch 22.10 merge 22.08 ([#800](https://github.com/rapidsai/raft/pull/800)) [@cjnolet](https://github.com/cjnolet) +- Statically link all CUDA toolkit libraries ([#797](https://github.com/rapidsai/raft/pull/797)) [@trxcllnt](https://github.com/trxcllnt) +- Minor follow-up fixes for ivf-flat ([#796](https://github.com/rapidsai/raft/pull/796)) [@achirkin](https://github.com/achirkin) +- KMeans benchmarks (cuML + ANN implementations) and fix for IndexT=int64_t ([#795](https://github.com/rapidsai/raft/pull/795)) [@Nyrio](https://github.com/Nyrio) +- Optimize fusedL2NN when data is skinny ([#794](https://github.com/rapidsai/raft/pull/794)) [@ahendriksen](https://github.com/ahendriksen) +- Complete the deprecation of duplicated hpp headers ([#793](https://github.com/rapidsai/raft/pull/793)) [@ahendriksen](https://github.com/ahendriksen) +- Prepare parts of the balanced kmeans for ivf-pq ([#788](https://github.com/rapidsai/raft/pull/788)) [@achirkin](https://github.com/achirkin) +- Unpin `dask` and `distributed` for development ([#783](https://github.com/rapidsai/raft/pull/783)) [@galipremsagar](https://github.com/galipremsagar) +- Exposing python wrapper for the RMAT generator logic ([#778](https://github.com/rapidsai/raft/pull/778)) 
[@teju85](https://github.com/teju85) +- Device, Host, Managed Accessor Types for `mdspan` ([#776](https://github.com/rapidsai/raft/pull/776)) [@divyegala](https://github.com/divyegala) +- Fix Forward-Merger Conflicts ([#768](https://github.com/rapidsai/raft/pull/768)) [@ajschmidt8](https://github.com/ajschmidt8) +- Fea 2208 kmeans use specializations ([#760](https://github.com/rapidsai/raft/pull/760)) [@cjnolet](https://github.com/cjnolet) +- ivf_flat::index: hide implementation details ([#747](https://github.com/rapidsai/raft/pull/747)) [@achirkin](https://github.com/achirkin) + # raft 22.08.00 (17 Aug 2022) ## 🚨 Breaking Changes diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 5c1e122525..e1dd682fd9 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -4,7 +4,7 @@ Devloping features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. -The process for working on a CUDA/C++ feature which spans RAFT and one or more consumers can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. +The process for working on a CUDA/C++ feature which might span RAFT and one or more consuming libraries can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the change is merged to the RAFT project and before it is merged to the dependent project(s) downstream. If building a feature which spans projects and not using the source build in cmake, the RAFT changes (both C++ and Python) will need to be installed into the environment of the consuming project before they can be used. The ideal integration of RAFT into consuming projects will enable both the source build in the consuming project only for this case but also rely on a more stable packaging (such as conda packaging) otherwise. @@ -14,6 +14,16 @@ Since RAFT is a core library with multiple consumers, it's important that the pu The public APIs should be lightweight wrappers around calls to private APIs inside the `detail` namespace. +## Common Design Considerations + +1. Use the `hpp` extension for files which can be compiled with `gcc` against the CUDA runtime. Use the `cuh` extension for files which require `nvcc` to be compiled. `hpp` can also be used for functions marked `__host__ __device__` only if proper checks are in place to remove the `__device__` designation when not compiling with `nvcc`.
+ +2. When additional classes, structs, or general POCO types are needed to represent data in the public API, place them in a new file called `_types.hpp`. This tells users they are safe to expose these types on their own public APIs without bringing in device code. At a minimum, the definitions for these types should not require `nvcc`. In general, these classes should only store very simple state and should not perform their own computations. Instead, new functions should be exposed on the public API which accept these objects, reading or updating their state as necessary. + +3. Public APIs should be well documented, easy to use, and it is highly preferred that they include usage instructions. + +4. Before creating a new primitive, check to see if one exists already. If one exists but the API isn't flexible enough to include your use-case, consider first refactoring the existing primitive. If that is not possible without an extreme number of changes, consider how the public API could be made more flexible. If the new primitive is different enough from all existing primitives, consider whether an existing public API could invoke it as an option or argument. If it is different enough from what exists already, add a header for the new public API function to the appropriate subdirectory and namespace. + ## Testing It's important for RAFT to maintain a high test coverage in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. A well-defined public API can help maintain compile-time stability but means more focus should be placed on testing the functional requirements and verifying execution on the various edge cases within RAFT itself. Ideally, bug fixes and new features should be able to be made to RAFT independently of the consuming projects. diff --git a/README.md b/README.md index 2159f128bf..2c0231f37e 100755 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ RAFT provides a header-only C++ library and pre-compiled shared libraries that c RAFT also provides 2 Python libraries: - `pylibraft` - low-level Python wrappers around RAFT algorithms and primitives. -- `pyraft` - reusable infrastructure for building analytics, including tools for building both single-GPU and multi-node multi-GPU algorithms. +- `raft-dask` - reusable infrastructure for building analytics, including tools for building both single-GPU and multi-node multi-GPU algorithms. ## Getting started @@ -39,7 +39,7 @@ The APIs in RAFT currently accept raw pointers to device memory and we are in th The `mdarray` forms a convenience layer over RMM and can be constructed in RAFT using a number of different helper functions: ```c++ -#include +#include int n_rows = 10; int n_cols = 10; @@ -56,8 +56,8 @@ Most of the primitives in RAFT accept a `raft::handle_t` object for the manageme The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances: ```c++ -#include -#include +#include +#include #include #include @@ -108,13 +108,15 @@ The easiest way to install RAFT is through conda and several packages are provid - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives.
- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives -- `pyraft` (optional) contains reusable Python infrastructure and tools to accelerate Python algorithm development. +- `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. -Use the following command to install RAFT with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. +Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. ```bash -mamba install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft ``` +You can also install the `libraft-*` conda packages individually using the `mamba` command above. + After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. ### CPM @@ -181,7 +183,7 @@ mamba env create --name raft_dev_env -f conda/environments/raft_dev_cuda11.5.yml mamba activate raft_dev_env ``` ``` -./build.sh pyraft pylibraft libraft tests bench --compile-libs +./build.sh raft-dask pylibraft libraft tests bench --compile-libs ``` The [build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) section of the build instructions. @@ -193,11 +195,18 @@ The folder structure mirrors other RAPIDS repos, with the following folders: - `ci`: Scripts for running CI in PRs - `conda`: Conda recipes and development conda environments - `cpp`: Source code for C++ libraries. - - `docs`: Doxygen configuration - - `include`: The C++ API is fully-contained here - - `src`: Compiled template specializations for the shared libraries + - `bench`: Benchmarks source code + - `cmake`: Cmake modules and templates + - `doxygen`: Doxygen configuration + - `include`: The C++ API headers are fully-contained here + - `scripts`: Helpful scripts for development + - `src`: Compiled APIs and template specializations for the shared libraries + - `test`: Googletests source code - `docs`: Source code and scripts for building library documentation (doxygen + pydocs) - `python`: Source code for Python libraries. + - `pylibraft`: Python build and source code for pylibraft library + - `raft-dask`: Python build and source code for raft-dask library +- `thirdparty`: Third-party licenses ## Contributing diff --git a/build.sh b/build.sh index 8b00fa69dd..a31d97c22c 100755 --- a/build.sh +++ b/build.sh @@ -18,14 +18,14 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" -HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool=] +VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" +HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] where is: clean - remove all existing build artifacts and configuration (start over) libraft - build the raft C++ code only. Also builds the C-wrapper library around the C++ code. - pyraft - build the pyraft Python package pylibraft - build the pylibraft Python package + raft-dask - build the raft-dask Python package. this also requires pylibraft. docs - build the documentation tests - build the tests bench - build the benchmarks @@ -35,9 +35,13 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=\"] [--cache-tool==0.29,<0.30 -- cmake>=3.20.1,!=3.23.0 +- cmake>=3.23.1 - scikit-build>=0.13.1 -- rapids-build-env=22.08.* -- rapids-notebook-env=22.08.* -- rapids-doc-env=22.08.* -- rmm=22.08.* -- dask-cuda=22.08.* +- rapids-build-env=22.10.* +- rapids-notebook-env=22.10.* +- rapids-doc-env=22.10.* +- rmm=22.10.* +- dask-cuda=22.10.* - ucx>=1.13.0 -- ucx-py=0.27.* +- ucx-py=0.28.* - ucx-proc=*=gpu - doxygen>=1.8.20 - libfaiss>=1.7.0 @@ -27,8 +30,8 @@ dependencies: - pip: - sphinx_markdown_tables - breathe - - git+https://github.com/dask/dask.git@2022.7.1 - - git+https://github.com/dask/distributed.git@2022.7.1 + - git+https://github.com/dask/dask.git@2022.9.2 + - git+https://github.com/dask/distributed.git@2022.9.2 # rapids-build-env, notebook-env and doc-env are defined in # https://docs.rapids.ai/maintainers/depmgmt/ diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml index 5991e3e370..d8cb5759c1 100644 --- a/conda/environments/raft_dev_cuda11.2.yml +++ b/conda/environments/raft_dev_cuda11.2.yml @@ -5,19 +5,22 @@ channels: - rapidsai-nightly - conda-forge dependencies: +- c-compiler +- cxx-compiler - cudatoolkit=11.2 +- ninja - clang=11.1.0 - clang-tools=11.1.0 - cython>=0.29,<0.30 -- cmake>=3.20.1,!=3.23.0 +- cmake>=3.23.1 - scikit-build>=0.13.1 -- rapids-build-env=22.08.* -- rapids-notebook-env=22.08.* -- rapids-doc-env=22.08.* -- rmm=22.08.* -- dask-cuda=22.08.* +- rapids-build-env=22.10.* +- rapids-notebook-env=22.10.* +- rapids-doc-env=22.10.* +- rmm=22.10.* +- dask-cuda=22.10.* - ucx>=1.13.0 -- ucx-py=0.27.* +- ucx-py=0.28.* - ucx-proc=*=gpu - doxygen>=1.8.20 - libfaiss>=1.7.0 @@ -27,8 +30,8 @@ dependencies: - pip: - sphinx_markdown_tables - breathe - - git+https://github.com/dask/dask.git@2022.7.1 - - git+https://github.com/dask/distributed.git@2022.7.1 + - git+https://github.com/dask/dask.git@2022.9.2 + - git+https://github.com/dask/distributed.git@2022.9.2 # rapids-build-env, notebook-env and doc-env are defined in # https://docs.rapids.ai/maintainers/depmgmt/ diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml index 21e3e1ff33..74ee0366ca 100644 --- a/conda/environments/raft_dev_cuda11.4.yml +++ b/conda/environments/raft_dev_cuda11.4.yml @@ -5,19 +5,22 @@ channels: - rapidsai-nightly - conda-forge dependencies: +- c-compiler +- cxx-compiler - cudatoolkit=11.4 +- ninja - clang=11.1.0 - clang-tools=11.1.0 - cython>=0.29,<0.30 -- cmake>=3.20.1,!=3.23.0 +- cmake>=3.23.1 - scikit-build>=0.13.1 -- rapids-build-env=22.08.* -- rapids-notebook-env=22.08.* -- rapids-doc-env=22.08.* -- rmm=22.08.* -- dask-cuda=22.08.* +- rapids-build-env=22.10.* +- 
rapids-notebook-env=22.10.* +- rapids-doc-env=22.10.* +- rmm=22.10.* +- dask-cuda=22.10.* - ucx>=1.13.0 -- ucx-py=0.27.* +- ucx-py=0.28.* - ucx-proc=*=gpu - doxygen>=1.8.20 - libfaiss>=1.7.0 @@ -27,8 +30,8 @@ dependencies: - pip: - sphinx_markdown_tables - breathe - - git+https://github.com/dask/dask.git@2022.7.1 - - git+https://github.com/dask/distributed.git@2022.7.1 + - git+https://github.com/dask/dask.git@2022.9.2 + - git+https://github.com/dask/distributed.git@2022.9.2 # rapids-build-env, notebook-env and doc-env are defined in # https://docs.rapids.ai/maintainers/depmgmt/ diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml index 49725eb39f..fca6684bc8 100644 --- a/conda/environments/raft_dev_cuda11.5.yml +++ b/conda/environments/raft_dev_cuda11.5.yml @@ -5,20 +5,23 @@ channels: - rapidsai-nightly - conda-forge dependencies: +- c-compiler +- cxx-compiler - cudatoolkit=11.5 - cuda-python >=11.5,<11.7.1 +- ninja - clang=11.1.0 - clang-tools=11.1.0 - cython>=0.29,<0.30 -- cmake>=3.20.1,!=3.23.0 +- cmake>=3.23.1 - scikit-build>=0.13.1 -- rapids-build-env=22.08.* -- rapids-notebook-env=22.08.* -- rapids-doc-env=22.08.* -- rmm=22.08.* -- dask-cuda=22.08.* +- rapids-build-env=22.10.* +- rapids-notebook-env=22.10.* +- rapids-doc-env=22.10.* +- rmm=22.10.* +- dask-cuda=22.10.* - ucx>=1.13.0 -- ucx-py=0.27.* +- ucx-py=0.28.* - ucx-proc=*=gpu - doxygen>=1.8.20 - libfaiss>=1.7.0 @@ -28,8 +31,8 @@ dependencies: - pip: - sphinx_markdown_tables - breathe - - git+https://github.com/dask/dask.git@2022.7.1 - - git+https://github.com/dask/distributed.git@2022.7.1 + - git+https://github.com/dask/dask.git@2022.9.2 + - git+https://github.com/dask/distributed.git@2022.9.2 # rapids-build-env, notebook-env and doc-env are defined in # https://docs.rapids.ai/maintainers/depmgmt/ diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index bed95d14b3..c4d0c2a087 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index 5c2fa69f8e..725c38cb6a 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -11,4 +11,4 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" diff --git a/conda/recipes/pyraft/build.sh b/conda/recipes/raft-dask/build.sh similarity index 81% rename from conda/recipes/pyraft/build.sh rename to conda/recipes/raft-dask/build.sh index 1462f365ff..963433dd8d 100644 --- a/conda/recipes/pyraft/build.sh +++ b/conda/recipes/raft-dask/build.sh @@ -3,4 +3,4 @@ # Copyright (c) 2022, NVIDIA CORPORATION. 
# This assumes the script is executed from the root of the repo directory -./build.sh pyraft --install --no-nvtx +./build.sh raft-dask --install --no-nvtx diff --git a/conda/recipes/pyraft/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml similarity index 86% rename from conda/recipes/pyraft/conda_build_config.yaml rename to conda/recipes/raft-dask/conda_build_config.yaml index 1ff86d58da..a6ca533504 100644 --- a/conda/recipes/pyraft/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -14,4 +14,4 @@ ucx_version: - "1.13.0" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/raft-dask/meta.yaml similarity index 90% rename from conda/recipes/pyraft/meta.yaml rename to conda/recipes/raft-dask/meta.yaml index 7a2e8d6c49..4e10294db7 100644 --- a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -10,7 +10,7 @@ {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: pyraft + name: raft-dask version: {{ version }} source: @@ -35,7 +35,7 @@ requirements: - cython>=0.29,<0.30 - scikit-build>=0.13.1 - rmm {{ minor_version }} - - libraft-headers {{ version }} + - pylibraft {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<11.7.1 - nccl>=2.9.9 @@ -45,14 +45,14 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-headers {{ version }} + - pylibraft {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx >={{ ucx_version }} - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu - - dask==2022.7.1 - - distributed==2022.7.1 + - dask==2022.9.2 + - distributed==2022.9.2 - cuda-python >=11.5,<11.7.1 - joblib >=0.11 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} @@ -61,10 +61,10 @@ tests: # [linux64] requirements: # [linux64] - cudatoolkit {{ cuda_version }}.* # [linux64] imports: # [linux64] - - raft # [linux64] + - raft_dask # [linux64] about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: pyraft library + summary: raft-dask library diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2c424e9431..ce6eb00bc1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -set(RAPIDS_VERSION "22.06") -set(RAFT_VERSION "${RAPIDS_VERSION}.00") +set(RAPIDS_VERSION "22.10") +set(RAFT_VERSION "22.10.00") -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) @@ -26,7 +26,7 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION 22.08.00 LANGUAGES CXX CUDA) +project(RAFT VERSION ${RAFT_VERSION} LANGUAGES CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. 
Setting this flag ensures @@ -53,7 +53,7 @@ option(BUILD_TESTS "Build raft unit-tests" ON) option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) @@ -85,7 +85,7 @@ message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}") message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}") message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}") -message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}") # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") @@ -106,6 +106,21 @@ endif() ############################################################################## # - compiler options --------------------------------------------------------- +set(_ctk_static_suffix "") +if(CUDA_STATIC_RUNTIME) + # If we're statically linking CTK cuBLAS, + # we also want to statically link BLAS + set(BLA_STATIC ON) + set(_ctk_static_suffix "_static") + # Control legacy FindCUDA.cmake behavior too + # Remove this after we push it into rapids-cmake: + # https://github.com/rapidsai/rapids-cmake/pull/259 + set(CUDA_USE_STATIC_CUDA_RUNTIME ON) +endif() + +# CUDA runtime +rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) + if (NOT DISABLE_OPENMP) find_package(OpenMP) if(OPENMP_FOUND) @@ -168,12 +183,11 @@ target_include_directories(raft INTERFACE # Only CUDA libs and rmm should # be used in global target. target_link_libraries(raft INTERFACE - CUDA::cublas - CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse rmm::rmm + CUDA::cublas${_ctk_static_suffix} + CUDA::curand${_ctk_static_suffix} + CUDA::cusolver${_ctk_static_suffix} + CUDA::cusparse${_ctk_static_suffix} $<$:raft::Thrust> ) @@ -214,6 +228,11 @@ endif() ############################################################################## # - raft_distance ------------------------------------------------------------ +# TODO: +# Currently, this package also contains the 'random' namespace (for rmat logic) +# We couldn't get this to work properly due to strange CI failures as noticed +# in the PR#778. In the long term, we should rename this package to `raft_compiled` +# in order to have a single pre-compiled raft package for those who need it. 
add_library(raft_distance INTERFACE) if(TARGET raft_distance AND (NOT TARGET raft::distance)) @@ -255,6 +274,17 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu + src/distance/specializations/detail/russel_rao_double_double_double_int.cu + src/distance/specializations/detail/russel_rao_float_float_float_uint32.cu + src/distance/specializations/detail/russel_rao_float_float_float_int.cu +# src/distance/specializations/fused_l2_nn_double_int.cu +# src/distance/specializations/fused_l2_nn_double_int64.cu +# src/distance/specializations/fused_l2_nn_float_int.cu +# src/distance/specializations/fused_l2_nn_float_int64.cu + src/random/specializations/rmat_rectangular_generator_int_double.cu + src/random/specializations/rmat_rectangular_generator_int64_double.cu + src/random/specializations/rmat_rectangular_generator_int_float.cu + src/random/specializations/rmat_rectangular_generator_int64_float.cu ) set_target_properties( raft_distance_lib @@ -310,6 +340,21 @@ if(RAFT_COMPILE_NN_LIBRARY) src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_search_float_int64_t.cu + src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu + src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu src/nn/specializations/fused_l2_knn_long_float_true.cu src/nn/specializations/fused_l2_knn_long_float_false.cu src/nn/specializations/fused_l2_knn_int_float_true.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 6b2d463d0e..51170e4265 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -14,63 +14,122 @@ # limitations under the License. 
#============================================================================= -set(RAFT_CPP_BENCH_TARGET "bench_raft") - -# (please keep the filenames in alphabetical order) -add_executable(${RAFT_CPP_BENCH_TARGET} - bench/distance/distance_cosine.cu - bench/distance/distance_exp_l2.cu - bench/distance/distance_l1.cu - bench/distance/distance_unexp_l2.cu - bench/linalg/add.cu - bench/linalg/map_then_reduce.cu - bench/linalg/matrix_vector_op.cu - bench/linalg/reduce.cu - bench/random/make_blobs.cu - bench/random/permute.cu - bench/random/rng.cu - bench/sparse/convert_csr.cu - bench/spatial/fused_l2_nn.cu - bench/spatial/knn.cu - bench/spatial/selection.cu - bench/main.cpp -) - -set_target_properties(${RAFT_CPP_BENCH_TARGET} - PROPERTIES BUILD_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - INSTALL_RPATH "\$ORIGIN/../../../lib" -) - -target_compile_options(${RAFT_CPP_BENCH_TARGET} - PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" -) - -target_include_directories(${RAFT_CPP_BENCH_TARGET} - PUBLIC "$" -) - -target_link_libraries(${RAFT_CPP_BENCH_TARGET} - PRIVATE - raft::raft - raft::distance - raft::nn - faiss::faiss - benchmark::benchmark - $ - $ -) +################################################################################################### +# - compiler function ----------------------------------------------------------------------------- + +function(ConfigureBench) + +set(options OPTIONAL DIST NN) +set(oneValueArgs NAME ) +set(multiValueArgs PATH TARGETS CONFIGURATIONS) + +cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + +set(BENCH_NAME ${ConfigureBench_NAME}) + +add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) + +message("BENCH PATH: ${ConfigureBench_PATH}") + +target_link_libraries(${BENCH_NAME} + PRIVATE + raft::raft + $<$:raft::distance> + $<$:raft::nn> + benchmark::benchmark + Threads::Threads + $ + $ + ) + +set_target_properties(${BENCH_NAME} + PROPERTIES + # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + +target_compile_options(${BENCH_NAME} + PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + +target_include_directories(${BENCH_NAME} + PUBLIC "$" + ) install( - TARGETS ${RAFT_CPP_BENCH_TARGET} - COMPONENT testing - DESTINATION bin/libraft/gbench - EXCLUDE_FROM_ALL -) + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/gbench/libraft + EXCLUDE_FROM_ALL) + +endfunction() + +if(BUILD_BENCH) + ConfigureBench(NAME CLUSTER_BENCH + PATH + bench/cluster/kmeans_balanced.cu + bench/cluster/kmeans.cu + bench/main.cpp + OPTIONAL DIST NN + ) + + ConfigureBench(NAME DISTANCE_BENCH + PATH + bench/distance/distance_cosine.cu + bench/distance/distance_exp_l2.cu + bench/distance/distance_l1.cu + bench/distance/distance_unexp_l2.cu + bench/main.cpp + OPTIONAL DIST + ) + + ConfigureBench(NAME LINALG_BENCH + PATH + bench/linalg/add.cu + bench/linalg/map_then_reduce.cu + bench/linalg/matrix_vector_op.cu + bench/linalg/reduce.cu + bench/main.cpp + ) + + ConfigureBench(NAME RANDOM_BENCH + PATH + bench/random/make_blobs.cu + bench/random/permute.cu + bench/random/rng.cu + bench/main.cpp + ) + + ConfigureBench(NAME SPARSE_BENCH + PATH + 
bench/sparse/convert_csr.cu + bench/main.cpp + ) + + ConfigureBench(NAME SPATIAL_BENCH + PATH + bench/spatial/fused_l2_nn.cu + bench/spatial/knn/brute_force_float_int64_t.cu + bench/spatial/knn/brute_force_float_uint32_t.cu + bench/spatial/knn/ivf_flat_float_int64_t.cu + bench/spatial/knn/ivf_flat_float_uint32_t.cu + bench/spatial/knn/ivf_flat_int8_t_int64_t.cu + bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu + bench/spatial/knn/ivf_pq_float_int64_t.cu + bench/spatial/knn/ivf_pq_float_uint32_t.cu + bench/spatial/knn/ivf_pq_int8_t_int64_t.cu + bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu + bench/spatial/selection.cu + bench/main.cpp + OPTIONAL DIST NN + ) +endif() + diff --git a/cpp/bench/cluster/kmeans.cu b/cpp/bench/cluster/kmeans.cu new file mode 100644 index 0000000000..bf4cc2f686 --- /dev/null +++ b/cpp/bench/cluster/kmeans.cu @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#include +#endif + +namespace raft::bench::cluster { + +struct KMeansBenchParams { + DatasetParams data; + BlobsParams blobs; + raft::cluster::KMeansParams kmeans; +}; + +template +struct KMeans : public BlobsFixture { + KMeans(const KMeansBenchParams& p) : BlobsFixture(p.data, p.blobs), params(p) {} + + void run_benchmark(::benchmark::State& state) override + { + raft::device_matrix_view X_view = this->X.view(); + std::optional> opt_weights_view = std::nullopt; + std::optional> centroids_view = + std::make_optional>(centroids.view()); + raft::device_vector_view labels_view = labels.view(); + raft::host_scalar_view inertia_view = raft::make_host_scalar_view(&inertia); + raft::host_scalar_view n_iter_view = raft::make_host_scalar_view(&n_iter); + + this->loop_on_state(state, [&]() { + raft::cluster::kmeans_fit_predict(this->handle, + params.kmeans, + X_view, + opt_weights_view, + centroids_view, + labels_view, + inertia_view, + n_iter_view); + }); + } + + void allocate_temp_buffers(const ::benchmark::State& state) override + { + centroids = + raft::make_device_matrix(this->handle, params.kmeans.n_clusters, params.data.cols); + labels = raft::make_device_vector(this->handle, params.data.rows); + } + + private: + KMeansBenchParams params; + raft::device_matrix centroids; + raft::device_vector labels; + T inertia; + IndexT n_iter; +}; // struct KMeans + +std::vector getKMeansInputs() +{ + std::vector out; + KMeansBenchParams p; + p.data.row_major = true; + p.blobs.cluster_std = 1.0; + p.blobs.shuffle = false; + p.blobs.center_box_min = -10.0; + p.blobs.center_box_max = 10.0; + p.blobs.seed = 12345ULL; + p.kmeans.init = raft::cluster::KMeansParams::KMeansPlusPlus; + p.kmeans.max_iter = 300; + p.kmeans.tol = 1e-4; + p.kmeans.verbosity = RAFT_LEVEL_INFO; + p.kmeans.metric = raft::distance::DistanceType::L2Expanded; + p.kmeans.inertia_check = true; + std::vector> row_cols_k = { + {1000000, 20, 1000}, + {3000000, 50, 20}, + {10000000, 50, 5}, + }; + 
for (auto& rck : row_cols_k) { + p.data.rows = std::get<0>(rck); + p.data.cols = std::get<1>(rck); + p.blobs.n_clusters = std::get<2>(rck); + p.kmeans.n_clusters = std::get<2>(rck); + for (auto bs_shift : std::vector({16, 18})) { + p.kmeans.batch_samples = 1 << bs_shift; + out.push_back(p); + } + } + return out; +} + +// note(lsugy): commenting out int64_t because the templates are not compiled in the distance +// library, resulting in long compilation times. +RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); + +} // namespace raft::bench::cluster diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/cluster/kmeans_balanced.cu new file mode 100644 index 0000000000..210b40ced8 --- /dev/null +++ b/cpp/bench/cluster/kmeans_balanced.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#include +#endif + +namespace raft::bench::cluster { + +struct KMeansBalancedBenchParams { + DatasetParams data; + uint32_t max_iter; + uint32_t n_lists; + raft::distance::DistanceType metric; +}; + +template +struct KMeansBalanced : public fixture { + KMeansBalanced(const KMeansBalancedBenchParams& p) : params(p) {} + + void run_benchmark(::benchmark::State& state) override + { + this->loop_on_state(state, [this]() { + raft::spatial::knn::detail::kmeans::build_hierarchical(this->handle, + this->params.max_iter, + (uint32_t)this->params.data.cols, + this->X.data_handle(), + this->params.data.rows, + this->centroids.data_handle(), + this->params.n_lists, + this->params.metric, + this->handle.get_stream()); + }); + } + + void allocate_data(const ::benchmark::State& state) override + { + X = raft::make_device_matrix(handle, params.data.rows, params.data.cols); + + raft::random::RngState rng{1234}; + constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); + constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); + if constexpr (std::is_integral_v) { + raft::random::uniformInt( + rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + } else { + raft::random::uniform( + rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + } + handle.sync_stream(stream); + } + + void allocate_temp_buffers(const ::benchmark::State& state) override + { + centroids = + raft::make_device_matrix(this->handle, params.n_lists, params.data.cols); + } + + private: + KMeansBalancedBenchParams params; + raft::device_matrix X; + raft::device_matrix centroids; +}; // struct KMeansBalanced + +std::vector getKMeansBalancedInputs() +{ + std::vector out; + KMeansBalancedBenchParams p; + p.data.row_major = true; + p.max_iter = 20; + p.metric = raft::distance::DistanceType::L2Expanded; + std::vector> row_cols = { + {100000, 128}, {1000000, 128}, {10000000, 128}, + // The following dataset sizes are too large for most GPUs. + // {100000000, 128}, + }; + for (auto& rc : row_cols) { + p.data.rows = rc.first; + p.data.cols = rc.second; + for (auto n_lists : std::vector({1000, 10000, 100000})) { + p.n_lists = n_lists; + out.push_back(p); + } + } + return out; +} + +// Note: the datasets sizes are too large for 32-bit index types. +RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); +RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); +RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); + +} // namespace raft::bench::cluster diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp index fb878a0c8d..adfe5218e2 100644 --- a/cpp/bench/common/benchmark.hpp +++ b/cpp/bench/common/benchmark.hpp @@ -18,9 +18,12 @@ #include +#include +#include #include #include #include +#include #include @@ -121,6 +124,10 @@ class fixture { // every benchmark should be overriding this virtual void run_benchmark(::benchmark::State& state) = 0; virtual void generate_metrics(::benchmark::State& state) {} + virtual void allocate_data(const ::benchmark::State& state) {} + virtual void deallocate_data(const ::benchmark::State& state) {} + virtual void allocate_temp_buffers(const ::benchmark::State& state) {} + virtual void deallocate_temp_buffers(const ::benchmark::State& state) {} protected: /** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache. */ @@ -144,6 +151,58 @@ class fixture { } }; +/** Indicates the dataset size. 
 namespace internal {
 
 template <typename Class, typename... Params>
@@ -162,8 +221,17 @@ class Fixture : public ::benchmark::Fixture {
   {
     fixture_ =
       std::apply([](const Params&... ps) { return std::make_unique<Class>(ps...); }, params_);
+    fixture_->allocate_data(state);
+    fixture_->allocate_temp_buffers(state);
+  }
+
+  void TearDown(const State& state) override
+  {
+    fixture_->deallocate_temp_buffers(state);
+    fixture_->deallocate_data(state);
+    fixture_.reset();
   }
-  void TearDown(const State& state) override { fixture_.reset(); }
+
   void SetUp(State& st) override { SetUp(const_cast<const State&>(st)); }
 
   void TearDown(State& st) override { TearDown(const_cast<const State&>(st)); }
@@ -248,6 +316,10 @@ struct registrar {
 };  // namespace internal
 
+#define RAFT_BENCH_REGISTER_INTERNAL(TestClass, ...)                                     \
+  static raft::bench::internal::registrar<TestClass> BENCHMARK_PRIVATE_NAME(registrar)( \
+    RAFT_STRINGIFY(TestClass), __VA_ARGS__)
+
 /**
  * This is the entry point macro for all benchmarks. This needs to be called
  * for the set of benchmarks to be registered so that the main harness inside
@@ -262,8 +334,7 @@ struct registrar {
  * empty string
  * @param params... zero or more lists of params upon which to benchmark.
  */
-#define RAFT_BENCH_REGISTER(TestClass, ...)                                              \
-  static raft::bench::internal::registrar<TestClass> BENCHMARK_PRIVATE_NAME(registrar)( \
-    #TestClass, __VA_ARGS__)
+#define RAFT_BENCH_REGISTER(TestClass, ...) 
\ + RAFT_BENCH_REGISTER_INTERNAL(RAFT_DEPAREN(TestClass), __VA_ARGS__) } // namespace raft::bench diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh index dae2550326..4f1a8ccab1 100644 --- a/cpp/bench/distance/distance_common.cuh +++ b/cpp/bench/distance/distance_common.cuh @@ -16,9 +16,9 @@ #include #include -#include +#include #if defined RAFT_DISTANCE_COMPILED -#include +#include #endif #include diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/linalg/add.cu index 7c651b61ed..7d00b8cbae 100644 --- a/cpp/bench/linalg/add.cu +++ b/cpp/bench/linalg/add.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include namespace raft::bench::linalg { diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/linalg/map_then_reduce.cu index 7eeb4a79b6..33a3e66264 100644 --- a/cpp/bench/linalg/map_then_reduce.cu +++ b/cpp/bench/linalg/map_then_reduce.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include namespace raft::bench::linalg { diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu index d3a53ea345..aa8f2667ed 100644 --- a/cpp/bench/linalg/matrix_vector_op.cu +++ b/cpp/bench/linalg/matrix_vector_op.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include namespace raft::bench::linalg { diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/linalg/reduce.cu index 018086a689..015e0b8abe 100644 --- a/cpp/bench/linalg/reduce.cu +++ b/cpp/bench/linalg/reduce.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu index c449223040..fdd4ef61d2 100644 --- a/cpp/bench/random/make_blobs.cu +++ b/cpp/bench/random/make_blobs.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu index a72eca3f87..5364bb44e3 100644 --- a/cpp/bench/random/permute.cu +++ b/cpp/bench/random/permute.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu index 0e701518ab..830fab13cc 100644 --- a/cpp/bench/sparse/convert_csr.cu +++ b/cpp/bench/sparse/convert_csr.cu @@ -14,8 +14,6 @@ * limitations under the License. 
 */
 
-#include
-#include
 #include
 #include
diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu
index dc3b507fbf..aa36483145 100644
--- a/cpp/bench/spatial/fused_l2_nn.cu
+++ b/cpp/bench/spatial/fused_l2_nn.cu
@@ -17,14 +17,17 @@
 #include
 #include
 #include
-#include
+#include
 #include
-#include
+#include
 #include
 
-#if defined RAFT_NN_COMPILED
-#include
-#endif
+// TODO: Once fusedL2NN is specialized in the raft_distance shared library, add
+// back
+//
+// #if defined RAFT_NN_COMPILED
+// #include
+// #endif
 
 namespace raft::bench::spatial {
 
@@ -73,6 +76,30 @@ struct fused_l2_nn : public fixture {
                                   false,
                                   stream);
     });
+
+    // Num distance calculations
+    int64_t num_dist_calcs = (int64_t)params.n * (int64_t)params.m;
+
+    int64_t num_flops = 3 * num_dist_calcs * params.k;
+
+    int64_t read_elts  = (int64_t)params.n * params.k + (int64_t)params.m * params.k;
+    int64_t write_elts = (int64_t)params.n;
+
+    state.counters["D/s"] = benchmark::Counter(num_dist_calcs,
+                                               benchmark::Counter::kIsIterationInvariantRate,
+                                               benchmark::Counter::OneK::kIs1000);
+
+    state.counters["FLOP/s"] = benchmark::Counter(
+      num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000);
+
+    state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(cub::KeyValuePair<int, float>),
+                                                 benchmark::Counter::kIsIterationInvariantRate,
+                                                 benchmark::Counter::OneK::kIs1000);
+    state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(float),
+                                                 benchmark::Counter::kIsIterationInvariantRate,
+                                                 benchmark::Counter::OneK::kIs1000);
+
+    state.counters["K"] = benchmark::Counter(params.k);
   }
 
  private:
@@ -88,9 +115,9 @@ const std::vector fused_l2_nn_input_vecs = {
   {32, 16384, 16384},  {64, 16384, 16384},   {128, 16384, 16384}, {256, 16384, 16384},
   {512, 16384, 16384}, {1024, 16384, 16384}, {16384, 32, 16384},  {16384, 64, 16384},
   {16384, 128, 16384}, {16384, 256, 16384},  {16384, 512, 16384}, {16384, 1024, 16384},
+  {16384, 16384, 2},   {16384, 16384, 4},    {16384, 16384, 8},   {16384, 16384, 16},
   {16384, 16384, 32},  {16384, 16384, 64},   {16384, 16384, 128}, {16384, 16384, 256},
   {16384, 16384, 512}, {16384, 16384, 1024}, {16384, 16384, 16384},
-
 };
 
 RAFT_BENCH_REGISTER(fused_l2_nn, "", fused_l2_nn_input_vecs);
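The counters make the kernel's arithmetic intensity explicit: each of the `m * n` distances costs one subtract, one multiply, and one add per dimension (hence `3 * m * n * k` flops), both operand matrices are read once, and one key-value pair is written per output row. A back-of-envelope check for the newly added `{16384, 16384, 32}` shape:

```cpp
#include <cstdint>
#include <cstdio>

// Expected per-iteration counter magnitudes for m = n = 16384, k = 32.
int main()
{
  const std::int64_t m = 16384, n = 16384, k = 32;
  const std::int64_t dist_calcs = m * n;                        // 268,435,456 distances
  const std::int64_t flops      = 3 * dist_calcs * k;           // ~25.8 GFLOP
  const std::int64_t bytes_read = (m + n) * k * sizeof(float);  // 4 MiB of input
  std::printf("%lld %lld %lld\n", (long long)dist_calcs, (long long)flops, (long long)bytes_read);
  return 0;
}
```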
diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cuh
similarity index 85%
rename from cpp/bench/spatial/knn.cu
rename to cpp/bench/spatial/knn.cuh
index 64a1217d7f..bb01320cdf 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cuh
@@ -14,13 +14,25 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include
 #include
 #include
+#include
+#include
+
+#if defined RAFT_DISTANCE_COMPILED
+#include
+#endif
+
 #if defined RAFT_NN_COMPILED
 #include
+#if defined RAFT_DISTANCE_COMPILED
+#include
+#endif
 #endif
 
 #include
@@ -44,16 +56,16 @@ struct params {
   size_t k;
 };
 
-auto operator<<(std::ostream& os, const params& p) -> std::ostream&
+inline auto operator<<(std::ostream& os, const params& p) -> std::ostream&
 {
   os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k;
   return os;
 }
 
-enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED };
-enum class Scope { BUILD, SEARCH, BUILD_SEARCH };
+enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED };  // NOLINT
+enum class Scope { BUILD, SEARCH, BUILD_SEARCH };                                       // NOLINT
 
-auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
+inline auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
 {
   switch (ts) {
     case TransferStrategy::NO_COPY: os << "NO_COPY"; break;
@@ -66,7 +78,7 @@ auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
   return os;
 }
 
-auto operator<<(std::ostream& os, const Scope& s) -> std::ostream&
+inline auto operator<<(std::ostream& os, const Scope& s) -> std::ostream&
 {
   switch (s) {
     case Scope::BUILD: os << "BUILD"; break;
@@ -155,6 +167,34 @@ struct ivf_flat_knn {
   }
 };
 
+template <typename ValT, typename IdxT>
+struct ivf_pq_knn {
+  using dist_t = float;
+
+  std::optional<raft::spatial::knn::ivf_pq::index<IdxT>> index;
+  raft::spatial::knn::ivf_pq::index_params index_params;
+  raft::spatial::knn::ivf_pq::search_params search_params;
+  params ps;
+
+  ivf_pq_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  {
+    index_params.n_lists = 4096;
+    index_params.metric  = raft::distance::DistanceType::L2Expanded;
+    index.emplace(raft::spatial::knn::ivf_pq::build(
+      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    search_params.n_probes = 20;
+    raft::spatial::knn::ivf_pq::search(
+      handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
+  }
+};
+
 template <typename ValT, typename IdxT>
 struct brute_force_knn {
   using dist_t = ValT;
@@ -216,7 +256,7 @@ struct knn : public fixture {
   }
 
   template <typename T>
-  void gen_data(raft::random::RngState& state,
+  void gen_data(raft::random::RngState& state,  // NOLINT
                 rmm::device_uvector<T>& vec,
                 size_t n,
                 rmm::cuda_stream_view stream)
@@ -337,15 +377,15 @@ struct knn : public fixture {
   rmm::device_uvector<IdxT> out_idxs_;
 };
 
-const std::vector<params> kInputs{
+inline const std::vector<params> kInputs{
   {2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}, {10000, 8192, 1000, 32}};
 
-const std::vector<TransferStrategy> kAllStrategies{
+inline const std::vector<TransferStrategy> kAllStrategies{
   TransferStrategy::NO_COPY, TransferStrategy::MAP_PINNED, TransferStrategy::MANAGED};
-const std::vector<TransferStrategy> kNoCopyOnly{TransferStrategy::NO_COPY};
+inline const std::vector<TransferStrategy> kNoCopyOnly{TransferStrategy::NO_COPY};
 
-const std::vector<Scope> kScopeFull{Scope::BUILD_SEARCH};
-const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD};
+inline const std::vector<Scope> kScopeFull{Scope::BUILD_SEARCH};
+inline const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD};
 
 #define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope)        \
   namespace BENCHMARK_PRIVATE_NAME(knn)                               \
   {                                                                   \
     using KNN = knn<ValT, IdxT, ImplT<ValT, IdxT>>;                   \
     RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" 
#ImplT, inputs, strats, scope); \ } -KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull); -KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); -KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); -KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); - -KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull); -KNN_REGISTER(float, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); -KNN_REGISTER(int8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); -KNN_REGISTER(uint8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); - } // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/brute_force_float_int64_t.cu b/cpp/bench/spatial/knn/brute_force_float_int64_t.cu new file mode 100644 index 0000000000..d981104e20 --- /dev/null +++ b/cpp/bench/spatial/knn/brute_force_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/brute_force_float_uint32_t.cu b/cpp/bench/spatial/knn/brute_force_float_uint32_t.cu new file mode 100644 index 0000000000..60f7edae96 --- /dev/null +++ b/cpp/bench/spatial/knn/brute_force_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_flat_float_int64_t.cu b/cpp/bench/spatial/knn/ivf_flat_float_int64_t.cu new file mode 100644 index 0000000000..594d4d16d2 --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_flat_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_flat_float_uint32_t.cu b/cpp/bench/spatial/knn/ivf_flat_float_uint32_t.cu new file mode 100644 index 0000000000..595ad2b922 --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_flat_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/spatial/knn/ivf_flat_int8_t_int64_t.cu new file mode 100644 index 0000000000..bd268f036c --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_flat_int8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu b/cpp/bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu new file mode 100644 index 0000000000..9d8b982c3e --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(uint8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_pq_float_int64_t.cu b/cpp/bench/spatial/knn/ivf_pq_float_int64_t.cu new file mode 100644 index 0000000000..18d8cd8ad6 --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_pq_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_pq_float_uint32_t.cu b/cpp/bench/spatial/knn/ivf_pq_float_uint32_t.cu new file mode 100644 index 0000000000..81621674bf --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_pq_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, uint32_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_pq_int8_t_int64_t.cu b/cpp/bench/spatial/knn/ivf_pq_int8_t_int64_t.cu new file mode 100644 index 0000000000..cc28eee67c --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_pq_int8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(int8_t, int64_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu b/cpp/bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu new file mode 100644 index 0000000000..b4759cbac1 --- /dev/null +++ b/cpp/bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(uint8_t, uint32_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu index c3a2bc6d3d..1f116c199f 100644 --- a/cpp/bench/spatial/selection.cu +++ b/cpp/bench/spatial/selection.cu @@ -18,7 +18,7 @@ #include #if defined RAFT_NN_COMPILED -#include +#include #endif #include diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index f61ba7014c..e6f06a00a5 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -15,7 +15,7 @@ #============================================================================= function(find_and_configure_faiss) - set(oneValueArgs VERSION PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -25,15 +25,16 @@ function(find_and_configure_faiss) LIBRARY_NAMES faiss ) - set(BUILD_SHARED_LIBS OFF) - if (NOT PKG_BUILD_STATIC_LIBS) - set(BUILD_SHARED_LIBS ON) + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) endif() rapids_cpm_find(faiss ${PKG_VERSION} GLOBAL_TARGETS faiss::faiss CPM_ARGS - GIT_REPOSITORY https://github.com/facebookresearch/faiss.git + GIT_REPOSITORY ${PKG_REPOSITORY} GIT_TAG ${PKG_PINNED_TAG} EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} OPTIONS @@ -42,6 +43,7 @@ function(find_and_configure_faiss) "FAISS_ENABLE_GPU ON" "BUILD_TESTING OFF" "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" ) if(TARGET faiss AND NOT TARGET faiss::faiss) @@ -66,7 +68,22 @@ function(find_and_configure_faiss) rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-nn-lib-exports) endfunction() +if(NOT RAFT_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT RAFT_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_REPOSITORY 
https://github.com/trxcllnt/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + find_and_configure_faiss(VERSION 1.7.0 - PINNED_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30 + REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} + PINNED_TAG ${RAFT_FAISS_GIT_TAG} BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) diff --git a/cpp/doxygen/Doxyfile.in b/cpp/doxygen/Doxyfile.in index 6f29e79146..549862600a 100644 --- a/cpp/doxygen/Doxyfile.in +++ b/cpp/doxygen/Doxyfile.in @@ -880,7 +880,27 @@ RECURSIVE = YES # run. EXCLUDE = @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/linalg/symmetrize.hpp \ - \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/cache \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/common \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/lap \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/selection \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/csr.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/linalg/lanczos.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/linalg/lanczos.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/cuda_utils.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/cudart_utils.h \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/util/device_atomics.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/device_utils.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/error.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/handle.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/integer_utils.h \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/interruptible.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/mdarray.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/pow2_utils.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/span.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/vectorized.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/raft.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/core/cudart_utils.hpp # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index b1b8255b7e..6a4f323c58 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -15,11 +15,12 @@ */ /** - * This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in a future release. */ -#include "raft/handle.hpp" -#include "raft/mdarray.hpp" -#include "raft/span.hpp" +#include "raft/core/device_mdarray.hpp" +#include "raft/core/device_mdspan.hpp" +#include "raft/core/device_span.hpp" +#include "raft/core/handle.hpp" #include diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index 3e2222eff1..60da09ca7c 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,356 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include -#include - -namespace raft { -namespace cache { - -/** - * @brief Collect vectors of data from the cache into a contiguous memory buffer. - * - * We assume contiguous memory layout for the output buffer, i.e. we get - * column vectors into a column major out buffer, or row vectors into a row - * major output buffer. 
- * - * On exit, the output array is filled the following way: - * out[i + n_vec*k] = cache[i + n_vec * cache_idx[k]]), where i=0..n_vec-1, and - * k = 0..n-1 where cache_idx[k] >= 0 - * - * We ignore vectors where cache_idx[k] < 0. - * - * @param [in] cache stores the cached data, size [n_vec x n_cached_vectors] - * @param [in] n_vec number of elements in a cached vector - * @param [in] cache_idx cache indices, size [n] - * @param [in] n the number of elements that need to be collected - * @param [out] out vectors collected from the cache, size [n_vec * n] - */ -template -__global__ void get_vecs( - const math_t* cache, int_t n_vec, const idx_t* cache_idx, int_t n, math_t* out) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int row = tid % n_vec; // row idx - if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx - size_t cache_col = cache_idx[out_col]; - if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } - } - } -} - -/** - * @brief Store vectors of data into the cache. - * - * Elements within a vector should be contiguous in memory (i.e. column vectors - * for column major data storage, or row vectors of row major data). - * - * If tile_idx==nullptr then the operation is the opposite of get_vecs, - * i.e. we store - * cache[i + cache_idx[k]*n_vec] = tile[i + k*n_vec], for i=0..n_vec-1, k=0..n-1 - * - * If tile_idx != nullptr, then we permute the vectors from tile according - * to tile_idx. This allows to store vectors from a buffer where the individual - * vectors are not stored contiguously (but the elements of each vector shall - * be contiguous): - * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec], - * for i=0..n_vec-1, k=0..n-1 - * - * @param [in] tile stores the data to be cashed cached, size [n_vec x n_tile] - * @param [in] n_tile number of vectors in the input tile - * @param [in] n_vec number of elements in a cached vector - * @param [in] tile_idx indices of vectors that we want to store - * @param [in] n number of vectos that we want to store (n <= n_tile) - * @param [in] cache_idx cache indices, size [n], negative values are ignored - * @param [inout] cache updated cache - * @param [in] n_cache_vecs - */ -template -__global__ void store_vecs(const math_t* tile, - int n_tile, - int n_vec, - const int* tile_idx, - int n, - const int* cache_idx, - math_t* cache, - int n_cache_vecs) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int row = tid % n_vec; // row idx - if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; - int cache_col = cache_idx[tile_col]; - - // We ignore negative values. The rest of the checks should be fulfilled - // if the cache is used properly - if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; - } - } -} - -/** - * @brief Map a key to a cache set. - * - * @param key key to be hashed - * @param n_cache_sets number of cache sets - * @return index of the cache set [0..n_cache_set) - */ -int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } - -/** - * @brief Binary search to find the first element in the array which is greater - * equal than a given value. 
- * @param [in] array sorted array of n numbers - * @param [in] n length of the array - * @param [in] val the value to search for - * @return the index of the first element in the array for which - * array[idx] >= value. If there is no such value, then return n. - */ -int DI arg_first_ge(const int* array, int n, int val) -{ - int start = 0; - int end = n - 1; - if (array[0] == val) return 0; - if (array[end] < val) return n; - while (start + 1 < end) { - int q = (start + end + 1) / 2; - // invariants: - // start < end - // start < q <=end - // array[start] < val && array[end] <=val - // at every iteration d = end-start is decreasing - // when d==0, then array[end] will be the first element >= val. - if (array[q] >= val) { - end = q; - } else { - start = q; - } - } - return end; -} -/** - * @brief Find the k-th occurrence of value in a sorted array. - * - * Assume that array is [0, 1, 1, 1, 2, 2, 4, 4, 4, 4, 6, 7] - * then find_nth_occurrence(cset, 12, 4, 2) == 7, because cset_array[7] stores - * the second element with value = 4. - * If there are less than k values in the array, then return -1 - * - * @param [in] array sorted array of numbers, size [n] - * @param [in] n number of elements in the array - * @param [in] val the value we are searching for - * @param [in] k - * @return the idx of the k-th occurance of val in array, or -1 if - * the value is not found. - */ -int DI find_nth_occurrence(const int* array, int n, int val, int k) -{ - int q = arg_first_ge(array, n, val); - if (q + k < n && array[q + k] == val) { - q += k; - } else { - q = -1; - } - return q; -} - /** - * @brief Rank the entries in a cache set according to the time stamp, return - * the indices that would sort the time stamp in ascending order. - * - * Assume we have a single cache set with time stamps as: - * key (threadIdx.x): 0 1 2 3 - * val (time stamp): 8 6 7 5 - * - * The corresponding sorted key-value pairs: - * key: 3 1 2 0 - * val: 5 6 7 8 - * rank: 0th 1st 2nd 3rd - * - * On return, the rank is assigned for each thread: - * threadIdx.x: 0 1 2 3 - * rank: 3 1 2 0 - * - * For multiple cache sets, launch one block per cache set. - * - * @tparam nthreads number of threads per block (nthreads <= associativity) - * @tparam associativity number of items in a cache set - * - * @param [in] cache_time time stamp of caching the data, - size [associativity * n_cache_sets] - * @param [in] n_cache_sets number of cache sets - * @param [out] rank within the cache set size [nthreads * items_per_thread] - * Each block should give a different pointer for rank. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -template -DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) -{ - const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort BlockRadixSort; - __shared__ typename BlockRadixSort::TempStorage temp_storage; - - int key[items_per_thread]; - int val[items_per_thread]; - - int block_offset = blockIdx.x * associativity; - - for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; - key[j] = t; - val[j] = k; - } - - BlockRadixSort(temp_storage).Sort(key, val); - - for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } - } - __syncthreads(); -} /** - * @brief Assign cache location to a set of keys using LRU replacement policy. 
- * - * The keys and the corresponding cache_set arrays shall be sorted according - * to cache_set in ascending order. One block should be launched for every cache - * set. - * - * Each cache set is sorted according to time_stamp, and values from keys - * are filled in starting at the oldest time stamp. Entries that were accessed - * at the current time are not reassigned. - * - * @tparam nthreads number of threads per block - * @tparam associativity number of keys in a cache set - * - * @param [in] keys that we want to cache size [n] - * @param [in] n number of keys - * @param [in] cache_set assigned to keys, size [n] - * @param [inout] cached_keys keys of already cached vectors, - * size [n_cache_sets*associativity], on exit it will be updated with the - * cached elements from keys. - * @param [in] n_cache_sets number of cache sets - * @param [inout] cache_time will be updated to "time" for those elements that - * could be assigned to a cache location, size [n_cache_sets*associativity] - * @param [in] time time stamp - * @param [out] cache_idx the cache idx assigned to the input, or -1 if it could - * not be cached, size [n] + * DISCLAIMER: this file is deprecated: use lap.cuh instead */ -template -__global__ void assign_cache_idx(const int* keys, - int n, - const int* cache_set, - int* cached_keys, - int n_cache_sets, - int* cache_time, - int time, - int* cache_idx) -{ - int block_offset = blockIdx.x * associativity; - - const int items_per_thread = raft::ceildiv(associativity, nthreads); - - // the size of rank limits how large associativity can be used in practice - __shared__ int rank[items_per_thread * nthreads]; - rank_set_entries(cache_time, n_cache_sets, rank); - - // Each thread will fill items_per_thread items in the cache. - // It uses a place, only if it was not updated at the current time step - // (cache_time != time). - // We rank the places according to the time stamp, least recently used - // elements come to the front. - // We fill the least recently used elements with the working set. - // there might be elements which cannot be assigned to cache loc. - // these elements are assigned -1. - for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; - int t_idx = block_offset + i; - bool mask = (i < associativity); - // whether this slot is available for writing - mask = mask && (cache_time[t_idx] != time); +#pragma once - // rank[i] tells which element to store by this thread - // we look up where is the corresponding key stored in the input array - if (mask) { - int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); - if (k > -1) { - int key_val = keys[k]; - cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; - } - } - } -} +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/util version instead.") -/* Unnamed namespace is used to avoid multiple definition error for the - following non-template function */ -namespace { -/** - * @brief Get the cache indices for keys stored in the cache. - * - * For every key, we look up the corresponding cache position. - * If keys[k] is stored in the cache, then is_cached[k] is set to true, and - * cache_idx[k] stores the corresponding cache idx. - * - * If keys[k] is not stored in the cache, then we assign a cache set to it. - * This cache set is stored in cache_idx[k], and is_cached[k] is set to false. - * In this case AssignCacheIdx should be called, to get an assigned position - * within the cache set. 
- * - * Cache_time is assigned to the time input argument for all elements in idx. - * - * @param [in] keys array of keys that we want to look up in the cache, size [n] - * @param [in] n number of keys to look up - * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] - * @param [in] n_cache_sets number of cache sets - * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * - * associativity] - * @param [out] cache_idx cache indices of the working set elements, size [n] - * @param [out] is_cached whether the element is cached size[n] - * @param [in] time iteration counter (used for time stamping) - */ -__global__ void get_cache_idx(int* keys, - int n, - int* cached_keys, - int n_cache_sets, - int associativity, - int* cache_time, - int* cache_idx, - bool* is_cached, - int time) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; - bool found = false; - // search for empty spot and the least recently used spot - while (i < associativity && !found) { - found = (cache_time[cidx + i] > 0 && cached_keys[cidx + i] == widx); - i++; - } - is_cached[tid] = found; - if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; // update time stamp - cache_idx[tid] = cidx; // exact cache idx - } else { - cache_idx[tid] = sidx; // assign cache set - } - } -} -}; // end unnamed namespace -}; // namespace cache -}; // namespace raft +#include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh similarity index 97% rename from cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh rename to cpp/include/raft/cluster/detail/agglomerative.cuh index c8a1eb8304..618f852bba 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/cluster/detail/agglomerative.cuh @@ -16,9 +16,9 @@ #pragma once -#include -#include -#include +#include +#include +#include #include @@ -35,11 +35,7 @@ #include -namespace raft { - -namespace hierarchy { -namespace detail { - +namespace raft::cluster::detail { template class UnionFind { public: @@ -329,6 +325,4 @@ void extract_flattened_clusters(const raft::handle_t& handle, } } -}; // namespace detail -}; // namespace hierarchy -}; // namespace raft +}; // namespace raft::cluster::detail diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh similarity index 86% rename from cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh rename to cpp/include/raft/cluster/detail/connectivities.cuh index f56366f21f..da8adf783d 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/cluster/detail/connectivities.cuh @@ -16,18 +16,18 @@ #pragma once -#include -#include -#include +#include +#include +#include #include #include -#include +#include +#include #include #include -#include -#include +#include #include #include @@ -35,11 +35,9 @@ #include -namespace raft { -namespace hierarchy { -namespace detail { +namespace raft::cluster::detail { -template +template struct distance_graph_impl { void run(const raft::handle_t& handle, const value_t* X, @@ -58,7 +56,7 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { +struct distance_graph_impl { void run(const raft::handle_t& 
handle, const value_t* X, size_t m, @@ -75,7 +73,7 @@ struct distance_graph_impl knn_graph_coo(stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); + raft::sparse::spatial::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); @@ -121,7 +119,7 @@ struct distance_graph_impl +template void get_distance_graph(const raft::handle_t& handle, const value_t* X, size_t m, @@ -140,6 +138,4 @@ void get_distance_graph(const raft::handle_t& handle, dist_graph.run(handle, X, m, n, metric, indptr, indices, data, c); } -}; // namespace detail -}; // namespace hierarchy -}; // namespace raft +}; // namespace raft::cluster::detail diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 303de77078..26005f58a0 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -27,19 +27,21 @@ #include #include -#include +#include #include +#include #include +#include #include #include -#include -#include +#include #include #include #include #include #include #include +#include #include #include @@ -561,12 +563,12 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle, X.data_handle() + cIdx * n_features, 1, n_features); // flag the sample that is chosen as initial centroid - std::vector h_isSampleCentroid(n_samples); + std::vector h_isSampleCentroid(n_samples); std::fill(h_isSampleCentroid.begin(), h_isSampleCentroid.end(), 0); h_isSampleCentroid[cIdx] = 1; // device buffer to flag the sample that is chosen as initial centroid - auto isSampleCentroid = raft::make_device_vector(handle, n_samples); + auto isSampleCentroid = raft::make_device_vector(handle, n_samples); raft::copy( isSampleCentroid.data_handle(), h_isSampleCentroid.data(), isSampleCentroid.size(), stream); @@ -798,6 +800,17 @@ void kmeans_fit(handle_t const& handle, RAFT_EXPECTS(centroids.extent(1) == n_features, "invalid parameter (centroids.extent(1) != n_features)"); + // Display a warning if batch_centroids is set and a fusedL2NN-compatible metric is used + if (params.batch_centroids != 0 && params.batch_centroids != params.n_clusters && + (params.metric == raft::distance::DistanceType::L2Expanded || + params.metric == raft::distance::DistanceType::L2SqrtExpanded)) { + RAFT_LOG_INFO( + "batch_centroids=%d was passed, but batch_centroids=%d will be used (reason: " + "batch_centroids has no impact on the memory footprint when FusedL2NN can be used)", + params.batch_centroids, + params.n_clusters); + } + logger::get(RAFT_NAME).set_level(params.verbosity); // Allocate memory diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh index 358c8ce16e..e9929a089d 100644 --- a/cpp/include/raft/cluster/detail/kmeans_common.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh @@ -27,14 +27,14 @@ #include #include -#include +#include #include +#include #include #include #include -#include #include -#include +#include #include #include #include @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -50,44 +51,16 @@ namespace raft { namespace cluster { namespace detail { -template -struct FusedL2NNReduceOp { - IndexT offset; - - FusedL2NNReduceOp(IndexT _offset) : offset(_offset){}; - - typedef typename cub::KeyValuePair KVP; - DI void operator()(IndexT rit, KVP* out, const KVP& other) - { - if (other.value < out->value) { - out->key = offset + other.key; 
- out->value = other.value; - } - } - - DI void operator()(IndexT rit, DataT* out, const KVP& other) - { - if (other.value < *out) { *out = other.value; } - } - - DI void init(DataT* out, DataT maxVal) { *out = maxVal; } - DI void init(KVP* out, DataT maxVal) - { - out->key = -1; - out->value = maxVal; - } -}; - template struct SamplingOp { DataT* rnd; - int* flag; + uint8_t* flag; DataT cluster_cost; double oversampling_factor; IndexT n_clusters; CUB_RUNTIME_FUNCTION __forceinline__ - SamplingOp(DataT c, double l, IndexT k, DataT* rand, int* ptr) + SamplingOp(DataT c, double l, IndexT k, DataT* rand, uint8_t* ptr) : cluster_cost(c), oversampling_factor(l), n_clusters(k), rnd(rand), flag(ptr) { } @@ -239,7 +212,7 @@ template void sampleCentroids(const raft::handle_t& handle, const raft::device_matrix_view& X, const raft::device_vector_view& minClusterDistance, - const raft::device_vector_view& isSampleCentroid, + const raft::device_vector_view& isSampleCentroid, SamplingOp& select_op, rmm::device_uvector& inRankCp, rmm::device_uvector& workspace) @@ -277,7 +250,7 @@ void sampleCentroids(const raft::handle_t& handle, raft::copy(&nPtsSampledInRank, nSelected.data_handle(), 1, stream); handle.sync_stream(stream); - IndexT* rawPtr_isSampleCentroid = isSampleCentroid.data_handle(); + uint8_t* rawPtr_isSampleCentroid = isSampleCentroid.data_handle(); thrust::for_each_n(handle.get_thrust_policy(), sampledMinClusterDistance.data_handle(), nPtsSampledInRank, @@ -345,13 +318,13 @@ void shuffleAndGather(const raft::handle_t& handle, if (workspace) { // shuffle indices on device - raft::random::permute(indices.data_handle(), - nullptr, - nullptr, - (IndexT)in.extent(1), - (IndexT)in.extent(0), - true, - stream); + raft::random::permute(indices.data_handle(), + nullptr, + nullptr, + (IndexT)in.extent(1), + (IndexT)in.extent(0), + true, + stream); } else { // shuffle indices on host and copy to device... 
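// In the hunks that follow, the L2Expanded/L2SqrtExpanded path no longer tiles
// over centroid batches: the 1-NN step becomes a single
// raft::distance::fusedL2NNMinReduce call over all centroids, which bundles
// the min-reduction that the deleted FusedL2NNReduceOp used to provide by
// hand. A sketch of the call shape, with illustrative names for the extents
// (not the patch's exact code):
//
//   workspace.resize(sizeof(int) * n_samples, stream);
//   raft::distance::fusedL2NNMinReduce<DataT, cub::KeyValuePair<IndexT, DataT>, IndexT>(
//     min_pairs,                      // one (centroid index, distance) pair per sample
//     dataset, centroids,             // row-major [n_samples x dim] and [n_clusters x dim]
//     dataset_norms, centroid_norms,  // precomputed row norms
//     n_samples, n_clusters, dim,
//     (void*)workspace.data(),
//     sqrt,                           // true only for L2SqrtExpanded
//     false,                          // do not re-initialize the output buffer
//     stream);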
std::vector ht_indices(n_samples); @@ -442,41 +415,35 @@ void minClusterAndDistanceCompute( auto L2NormXView = raft::make_device_vector_view(L2NormX.data_handle() + dIdx, ns); - // tile over the centroids - for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { - // # of centroids for the current batch - auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); - - // centroidsView [nc x n_features] - view representing the current batch - // of centroids - auto centroidsView = raft::make_device_matrix_view( - centroids.data_handle() + (cIdx * n_features), nc, n_features); + if (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded) { + workspace.resize((sizeof(int)) * ns, stream); + + // todo(lsugy): remove cIdx + raft::distance::fusedL2NNMinReduce, IndexT>( + minClusterAndDistanceView.data_handle(), + datasetView.data_handle(), + centroids.data_handle(), + L2NormXView.data_handle(), + centroidsNorm.data_handle(), + ns, + n_clusters, + n_features, + (void*)workspace.data(), + metric != raft::distance::DistanceType::L2Expanded, + false, + stream); + } else { + // tile over the centroids + for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { + // # of centroids for the current batch + auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); + + // centroidsView [nc x n_features] - view representing the current batch + // of centroids + auto centroidsView = raft::make_device_matrix_view( + centroids.data_handle() + (cIdx * n_features), nc, n_features); - if (metric == raft::distance::DistanceType::L2Expanded || - metric == raft::distance::DistanceType::L2SqrtExpanded) { - auto centroidsNormView = - raft::make_device_vector_view(centroidsNorm.data_handle() + cIdx, nc); - workspace.resize((sizeof(int)) * ns, stream); - - FusedL2NNReduceOp redOp(cIdx); - raft::distance::KVPMinReduce pairRedOp; - - raft::distance::fusedL2NN, IndexT>( - minClusterAndDistanceView.data_handle(), - datasetView.data_handle(), - centroidsView.data_handle(), - L2NormXView.data_handle(), - centroidsNormView.data_handle(), - ns, - nc, - n_features, - (void*)workspace.data(), - redOp, - pairRedOp, - (metric == raft::distance::DistanceType::L2Expanded) ? 
false : true, - false, - stream); - } else { // pairwiseDistanceView [ns x nc] - view representing the pairwise // distance for current batch auto pairwiseDistanceView = @@ -577,40 +544,35 @@ void minClusterDistanceCompute(const raft::handle_t& handle, auto L2NormXView = raft::make_device_vector_view(L2NormX.data_handle() + dIdx, ns); - // tile over the centroids - for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { - // # of centroids for the current batch - auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); - - // centroidsView [nc x n_features] - view representing the current batch - // of centroids - auto centroidsView = raft::make_device_matrix_view( - centroids.data_handle() + cIdx * n_features, nc, n_features); - - if (metric == raft::distance::DistanceType::L2Expanded || - metric == raft::distance::DistanceType::L2SqrtExpanded) { - auto centroidsNormView = - raft::make_device_vector_view(centroidsNorm.data_handle() + cIdx, nc); - workspace.resize((sizeof(IndexT)) * ns, stream); - - FusedL2NNReduceOp redOp(cIdx); - raft::distance::KVPMinReduce pairRedOp; - raft::distance::fusedL2NN( - minClusterDistanceView.data_handle(), - datasetView.data_handle(), - centroidsView.data_handle(), - L2NormXView.data_handle(), - centroidsNormView.data_handle(), - ns, - nc, - n_features, - (void*)workspace.data(), - redOp, - pairRedOp, - (metric != raft::distance::DistanceType::L2Expanded), - false, - stream); - } else { + if (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded) { + workspace.resize((sizeof(IndexT)) * ns, stream); + + // todo(lsugy): remove cIdx + raft::distance::fusedL2NNMinReduce( + minClusterDistanceView.data_handle(), + datasetView.data_handle(), + centroids.data_handle(), + L2NormXView.data_handle(), + centroidsNorm.data_handle(), + ns, + n_clusters, + n_features, + (void*)workspace.data(), + metric != raft::distance::DistanceType::L2Expanded, + false, + stream); + } else { + // tile over the centroids + for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { + // # of centroids for the current batch + auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); + + // centroidsView [nc x n_features] - view representing the current batch + // of centroids + auto centroidsView = raft::make_device_matrix_view( + centroids.data_handle() + cIdx * n_features, nc, n_features); + // pairwiseDistanceView [ns x nc] - view representing the pairwise // distance for current batch auto pairwiseDistanceView = diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index d57fd5254a..2746b6f657 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -42,13 +42,13 @@ #include #include -#include -#include -#include -#include +#include #include #include #include +#include +#include +#include namespace raft { namespace cluster { diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh similarity index 86% rename from cpp/include/raft/sparse/hierarchy/detail/mst.cuh rename to cpp/include/raft/cluster/detail/mst.cuh index 545a371850..67935d4623 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/cluster/detail/mst.cuh @@ -16,25 +16,23 @@ #pragma once -#include -#include +#include +#include -#include #include -#include +#include +#include #include #include #include 
#include -namespace raft { -namespace hierarchy { -namespace detail { +namespace raft::cluster::detail { template -void merge_msts(raft::Graph_COO& coo1, - raft::Graph_COO& coo2, +void merge_msts(sparse::solver::Graph_COO& coo1, + sparse::solver::Graph_COO& coo2, cudaStream_t stream) { /** Add edges to existing mst **/ @@ -71,7 +69,7 @@ template void connect_knn_graph( const raft::handle_t& handle, const value_t* X, - raft::Graph_COO& msf, + sparse::solver::Graph_COO& msf, size_t m, size_t n, value_idx* color, @@ -82,7 +80,7 @@ void connect_knn_graph( raft::sparse::COO connected_edges(stream); - raft::linkage::connect_components( + raft::sparse::spatial::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); @@ -91,16 +89,17 @@ void connect_knn_graph( // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst(handle, - indptr2.data(), - connected_edges.cols(), - connected_edges.vals(), - m, - connected_edges.nnz, - color, - stream, - false, - false); + auto new_mst = + raft::sparse::solver::mst(handle, + indptr2.data(), + connected_edges.cols(), + connected_edges.vals(), + m, + connected_edges.nnz, + color, + stream, + false, + false); merge_msts(msf, new_mst, stream); } @@ -150,18 +149,18 @@ void build_sorted_mst( auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. - auto mst_coo = raft::mst::mst( + auto mst_coo = raft::sparse::solver::mst( handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, stream); + int n_components = raft::sparse::spatial::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); iters++; - n_components = linkage::get_n_components(color, m, stream); + n_components = raft::sparse::spatial::get_n_components(color, m, stream); } /** @@ -192,6 +191,4 @@ void build_sorted_mst( raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream); } -}; // namespace detail -}; // namespace hierarchy -}; // namespace raft +}; // namespace raft::cluster::detail diff --git a/cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh similarity index 90% rename from cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh rename to cpp/include/raft/cluster/detail/single_linkage.cuh index 4e94b6f65d..9eee21b09c 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh +++ b/cpp/include/raft/cluster/detail/single_linkage.cuh @@ -16,17 +16,15 @@ #pragma once -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include -namespace raft { -namespace hierarchy { -namespace detail { +namespace raft::cluster::detail { static const size_t EMPTY = 0; @@ -56,7 +54,7 @@ void single_linkage(const raft::handle_t& handle, size_t m, size_t n, raft::distance::DistanceType metric, - linkage_output* out, + linkage_output* out, int c, size_t n_clusters) { @@ -82,7 +80,7 @@ void single_linkage(const raft::handle_t& handle, * 2. 
Construct MST, sorted by weights
   */
   rmm::device_uvector<value_idx> color(m, stream);
-  raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
+  raft::sparse::spatial::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
   detail::build_sorted_mst(handle,
                            X,
                            indptr.data(),
@@ -123,6 +121,4 @@ void single_linkage(const raft::handle_t& handle,
   out->n_leaves               = m;
   out->n_connected_components = 1;
 }
-};  // namespace detail
-};  // namespace hierarchy
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft::cluster::detail
\ No newline at end of file
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index d46f53d9c1..539fc33c40 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -17,12 +17,10 @@
 
 #include
 #include
-#include
+#include
 #include
 
-namespace raft {
-namespace cluster {
-
+namespace raft::cluster {
 /**
  * @brief Find clusters with k-means algorithm.
  *        Initial centroids are chosen with k-means++ algorithm. Empty
@@ -488,5 +486,4 @@ void kmeans_fit_main(const raft::handle_t& handle,
   detail::kmeans_fit_main(
     handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace);
 }
-}  // namespace cluster
-}  // namespace raft
+}  // namespace raft::cluster
diff --git a/cpp/include/raft/cluster/kmeans_params.hpp b/cpp/include/raft/cluster/kmeans_params.hpp
index 70ea49d36d..433e32f5ff 100644
--- a/cpp/include/raft/cluster/kmeans_params.hpp
+++ b/cpp/include/raft/cluster/kmeans_params.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,61 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
-#include
-#include
-#include
-
-namespace raft {
-namespace cluster {
-
-struct KMeansParams {
-  enum InitMethod { KMeansPlusPlus, Random, Array };
-
-  // The number of clusters to form as well as the number of centroids to
-  // generate (default:8).
-  int n_clusters = 8;
-
-  /*
-   * Method for initialization, defaults to k-means++:
-   *  - InitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm
-   * to select the initial cluster centers.
-   *  - InitMethod::Random (random): Choose 'n_clusters' observations (rows) at
-   * random from the input data for the initial centroids.
-   *  - InitMethod::Array (ndarray): Use 'centroids' as initial cluster centers.
-   */
-  InitMethod init = KMeansPlusPlus;
-
-  // Maximum number of iterations of the k-means algorithm for a single run.
-  int max_iter = 300;
-
-  // Relative tolerance with regards to inertia to declare convergence.
-  double tol = 1e-4;
-
-  // verbosity level.
-  int verbosity = RAFT_LEVEL_INFO;
-
-  // Seed to the random number generator.
-  raft::random::RngState rng_state =
-    raft::random::RngState(0, raft::random::GeneratorType::GenPhilox);
-
-  // Metric to use for distance computation.
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded;
+/**
+ * This file is deprecated and will be removed in a future release.
+ * Please use raft/cluster/kmeans_types.hpp instead.
+ */
 
-  // Number of instance k-means algorithm will be run with different seeds.
-  int n_init = 1;
+/**
+ * DISCLAIMER: this file is deprecated: use kmeans_types.hpp instead
+ */
 
-  // Oversampling factor for use in the k-means|| algorithm.
-  double oversampling_factor = 2.0;
+#pragma once
 
-  // batch_samples and batch_centroids are used to tile 1NN computation which is
-  // useful to optimize/control the memory footprint
-  // Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
-  // then don't tile the centroids
-  int batch_samples   = 1 << 15;
-  int batch_centroids = 0;  // if 0 then batch_centroids = n_clusters
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft/cluster/kmeans_types.hpp version instead.")
 
-  bool inertia_check = false;
-};
-}  // namespace cluster
-}  // namespace raft
+#include <raft/cluster/kmeans_types.hpp>
- double oversampling_factor = 2.0; +#pragma once - // batch_samples and batch_centroids are used to tile 1NN computation which is - // useful to optimize/control the memory footprint - // Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0 - // then don't tile the centroids - int batch_samples = 1 << 15; - int batch_centroids = 0; // if 0 then batch_centroids = n_clusters +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/cluster/kmeans_types.hpp version instead.") - bool inertia_check = false; -}; -} // namespace cluster -} // namespace raft +#include diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp new file mode 100644 index 0000000000..87fc7c1880 --- /dev/null +++ b/cpp/include/raft/cluster/kmeans_types.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include + +namespace raft { +namespace cluster { + +struct KMeansParams { + enum InitMethod { KMeansPlusPlus, Random, Array }; + + // The number of clusters to form as well as the number of centroids to + // generate (default:8). + int n_clusters = 8; + + /* + * Method for initialization, defaults to k-means++: + * - InitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm + * to select the initial cluster centers. + * - InitMethod::Random (random): Choose 'n_clusters' observations (rows) at + * random from the input data for the initial centroids. + * - InitMethod::Array (ndarray): Use 'centroids' as initial cluster centers. + */ + InitMethod init = KMeansPlusPlus; + + // Maximum number of iterations of the k-means algorithm for a single run. + int max_iter = 300; + + // Relative tolerance with regards to inertia to declare convergence. + double tol = 1e-4; + + // verbosity level. + int verbosity = RAFT_LEVEL_INFO; + + // Seed to the random number generator. + raft::random::RngState rng_state = + raft::random::RngState(0, raft::random::GeneratorType::GenPhilox); + + // Metric to use for distance computation. + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; + + // Number of instance k-means algorithm will be run with different seeds. + int n_init = 1; + + // Oversampling factor for use in the k-means|| algorithm. + double oversampling_factor = 2.0; + + // batch_samples and batch_centroids are used to tile 1NN computation which is + // useful to optimize/control the memory footprint + // Default tile is [batch_samples x n_clusters] i.e. 
when batch_centroids is 0 + // then don't tile the centroids + int batch_samples = 1 << 15; + int batch_centroids = 0; // if 0 then batch_centroids = n_clusters + + bool inertia_check = false; +}; +} // namespace cluster +} // namespace raft diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh new file mode 100644 index 0000000000..8e33b8389d --- /dev/null +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::cluster { + +constexpr int DEFAULT_CONST_C = 15; + +/** + * Single-linkage clustering, capable of constructing a KNN graph to + * scale the algorithm beyond the n^2 memory consumption of implementations + * that use the fully-connected graph of pairwise distances by connecting + * a knn graph when k is not large enough to connect it. + + * @tparam value_idx + * @tparam value_t + * @tparam dist_type method to use for constructing connectivities graph + * @param[in] handle raft handle + * @param[in] X dense input matrix in row-major layout + * @param[in] m number of rows in X + * @param[in] n number of columns in X + * @param[in] metric distance metric to use when constructing connectivities graph + * @param[out] out struct containing output dendrogram and cluster assignments + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control + * of k. The algorithm will set `k = log(n) + c` + * @param[in] n_clusters number of clusters to assign data samples + */ +template +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + detail::single_linkage( + handle, X, m, n, metric, out, c, n_clusters); +} + +/** + * Single-linkage clustering, capable of constructing a KNN graph to + * scale the algorithm beyond the n^2 memory consumption of implementations + * that use the fully-connected graph of pairwise distances by connecting + * a knn graph when k is not large enough to connect it. + + * @tparam value_idx + * @tparam value_t + * @tparam dist_type method to use for constructing connectivities graph + * @param[in] handle raft handle + * @param[in] X dense input matrix in row-major layout + * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) + * @param[out] labels output labels vector (size n_rows) + * @param[in] metric distance metric to use when constructing connectivities graph + * @param[in] n_clusters number of clusters to assign data samples + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control of k. 
The algorithm will set `k = log(n) + c` + */ +template +void single_linkage(const raft::handle_t& handle, + raft::device_matrix_view X, + raft::device_matrix_view dendrogram, + raft::device_vector_view labels, + raft::distance::DistanceType metric, + size_t n_clusters, + std::optional c = std::make_optional(DEFAULT_CONST_C)) +{ + linkage_output out_arrs; + out_arrs.children = dendrogram.data_handle(); + out_arrs.labels = labels.data_handle(); + + single_linkage(handle, + X.data_handle(), + static_cast(X.extent(0)), + static_cast(X.extent(1)), + metric, + &out_arrs, + c.has_value() ? c.value() : DEFAULT_CONST_C, + n_clusters); +} + +}; // namespace raft::cluster diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp new file mode 100644 index 0000000000..79f2ede482 --- /dev/null +++ b/cpp/include/raft/cluster/single_linkage_types.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::cluster { + +enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; + +/** + * Simple POCO for consolidating linkage results. This closely + * mirrors the trained instance variables populated in + * Scikit-learn's AgglomerativeClustering estimator. + * @tparam value_idx + * @tparam value_t + */ +template +class linkage_output { + public: + idx_t m; + idx_t n_clusters; + + idx_t n_leaves; + idx_t n_connected_components; + + // TODO: These will be made private in a future release + idx_t* labels; // size: m + idx_t* children; // size: (m-1, 2) + + raft::device_vector_view get_labels() + { + return raft::make_device_vector_view(labels, m); + } + + raft::device_matrix_view get_children() + { + return raft::make_device_matrix_view(children, m - 1, 2); + } +}; + +class linkage_output_int : public linkage_output { +}; +class linkage_output_int64 : public linkage_output { +}; + +}; // namespace raft::cluster \ No newline at end of file diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh new file mode 100644 index 0000000000..3bb5a26ace --- /dev/null +++ b/cpp/include/raft/cluster/specializations.cuh @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
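For context, the mdspan-based `single_linkage` overload added above can be exercised end to end roughly as follows. This is a minimal sketch rather than part of the patch: the `raft::make_device_matrix`/`make_device_vector` factories, `make_device_matrix_view`, and the exact template-argument spelling are assumed from RAFT's surrounding mdspan work, since the flattened hunks above no longer show the template parameter lists.

```cpp
#include <raft/cluster/single_linkage.cuh>

void cluster_rows(const raft::handle_t& handle)
{
  int n_rows = 1000, n_cols = 16;

  // Row-major input; filling X with real data is elided here.
  auto X          = raft::make_device_matrix<float, int>(handle, n_rows, n_cols);
  auto dendrogram = raft::make_device_matrix<int, int>(handle, n_rows - 1, 2);
  auto labels     = raft::make_device_vector<int, int>(handle, n_rows);

  // The input is taken as a const view; building it explicitly avoids relying
  // on an implicit non-const to const mdspan conversion.
  auto X_view = raft::make_device_matrix_view<const float, int>(X.data_handle(), n_rows, n_cols);

  // Omitting the optional `c` falls back to DEFAULT_CONST_C (15), so the
  // connectivities graph is built with k = log(n) + 15 neighbors per point.
  raft::cluster::single_linkage<float, int, raft::cluster::LinkageDistance::KNN_GRAPH>(
    handle, X_view, dendrogram.view(), labels.view(),
    raft::distance::DistanceType::L2SqrtExpanded, /*n_clusters=*/5);
}
```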
+ */ +#ifndef __CLUSTER_SPECIALIZATIONS_H +#define __CLUSTER_SPECIALIZATIONS_H + +#pragma once + +#include +#include + +#endif \ No newline at end of file diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 32a46968b6..e80d7cccd9 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,41 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include -#include - -namespace raft { +/** + * This file is deprecated and will be removed in a future release. + * There is no equivalent in RAFT's public API. + */ /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream + * DISCLAIMER: this file is deprecated and has no equivalent in RAFT's public API */ -template -void sortPairs(rmm::device_uvector& workspace, - const KeyT* inKeys, - KeyT* outKeys, - const ValueT* inVals, - ValueT* outVals, - int len, - cudaStream_t stream) -{ - size_t worksize; - cub::DeviceRadixSort::SortPairs( - nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); - workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs( - workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); -} -} // namespace raft +#pragma once + +#pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please note that there is no equivalent in RAFT's public API" \ " so this file will eventually be removed altogether.") + +#include diff --git a/cpp/include/raft/common/detail/scatter.cuh b/cpp/include/raft/common/detail/scatter.cuh index 4087625320..87a8826aa6 100644 --- a/cpp/include/raft/common/detail/scatter.cuh +++ b/cpp/include/raft/common/detail/scatter.cuh @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include namespace raft::detail { diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index 0c4750aa69..f3cfbd81cc 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,526 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include - -namespace raft { - /** - * @defgroup SmemStores Shared memory store operations - * @{ - * @brief Stores to shared memory (both vectorized and non-vectorized forms) - * requires the given shmem pointer to be aligned by the vector - length, like for float4 lds/sts shmem pointer should be aligned - by 16 bytes else it might silently fail or can also give - runtime error. 
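Since the `sortPairs` wrapper removed above has no public RAFT replacement, callers can use CUB's two-phase API directly. The sketch below (a hypothetical `sort_pairs` helper, not something this patch adds) mirrors the deleted wrapper: the first `SortPairs` call with a null temporary-storage pointer only computes the required workspace size, the second performs the sort.

```cpp
#include <cub/cub.cuh>
#include <rmm/device_uvector.hpp>

template <typename KeyT, typename ValueT>
void sort_pairs(rmm::device_uvector<char>& workspace,
                const KeyT* in_keys, KeyT* out_keys,
                const ValueT* in_vals, ValueT* out_vals,
                int len, cudaStream_t stream)
{
  size_t worksize = 0;
  // Size query: no work is done while the temporary-storage pointer is null.
  cub::DeviceRadixSort::SortPairs(
    nullptr, worksize, in_keys, out_keys, in_vals, out_vals, len, 0, sizeof(KeyT) * 8, stream);
  workspace.resize(worksize, stream);
  // Actual radix sort over all key bits, enqueued on `stream`.
  cub::DeviceRadixSort::SortPairs(
    workspace.data(), worksize, in_keys, out_keys, in_vals, out_vals, len, 0, sizeof(KeyT) * 8, stream);
}
```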
- * @param[out] addr shared memory address (should be aligned to vector size) - * @param[in] x data to be stored at this address + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -DI void sts(uint8_t* addr, const uint8_t& x) -{ - uint32_t x_int; - x_int = x; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int)); -} -DI void sts(uint8_t* addr, const uint8_t (&x)[1]) -{ - uint32_t x_int[1]; - x_int[0] = x[0]; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); -} -DI void sts(uint8_t* addr, const uint8_t (&x)[2]) -{ - uint32_t x_int[2]; - x_int[0] = x[0]; - x_int[1] = x[1]; - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.u8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); -} -DI void sts(uint8_t* addr, const uint8_t (&x)[4]) -{ - uint32_t x_int[4]; - x_int[0] = x[0]; - x_int[1] = x[1]; - x_int[2] = x[2]; - x_int[3] = x[3]; - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v4.u8 [%0], {%1, %2, %3, %4};" - : - : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); -} - -DI void sts(int8_t* addr, const int8_t& x) -{ - int32_t x_int = x; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int)); -} -DI void sts(int8_t* addr, const int8_t (&x)[1]) -{ - int32_t x_int[1]; - x_int[0] = x[0]; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); -} -DI void sts(int8_t* addr, const int8_t (&x)[2]) -{ - int32_t x_int[2]; - x_int[0] = x[0]; - x_int[1] = x[1]; - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.s8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); -} -DI void sts(int8_t* addr, const int8_t (&x)[4]) -{ - int32_t x_int[4]; - x_int[0] = x[0]; - x_int[1] = x[1]; - x_int[2] = x[2]; - x_int[3] = x[3]; - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v4.s8 [%0], {%1, %2, %3, %4};" - : - : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); -} - -DI void sts(uint32_t* addr, const uint32_t& x) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); -} -DI void sts(uint32_t* addr, const uint32_t (&x)[1]) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); -} -DI void sts(uint32_t* addr, const uint32_t (&x)[2]) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); -} -DI void sts(uint32_t* addr, const uint32_t (&x)[4]) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); -} - -DI void sts(int32_t* addr, const int32_t& x) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); -} -DI void sts(int32_t* addr, const int32_t (&x)[1]) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); -} -DI void 
sts(int32_t* addr, const int32_t (&x)[2]) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); -} -DI void sts(int32_t* addr, const int32_t (&x)[4]) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); -} - -DI void sts(float* addr, const float& x) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); -} -DI void sts(float* addr, const float (&x)[1]) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); -} -DI void sts(float* addr, const float (&x)[2]) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); -} -DI void sts(float* addr, const float (&x)[4]) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" - : - : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); -} - -DI void sts(double* addr, const double& x) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); -} -DI void sts(double* addr, const double (&x)[1]) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); -} -DI void sts(double* addr, const double (&x)[2]) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); -} -/** @} */ /** - * @defgroup SmemLoads Shared memory load operations - * @{ - * @brief Loads from shared memory (both vectorized and non-vectorized forms) - requires the given shmem pointer to be aligned by the vector - length, like for float4 lds/sts shmem pointer should be aligned - by 16 bytes else it might silently fail or can also give - runtime error. 
- * @param[out] x the data to be loaded - * @param[in] addr shared memory address from where to load - * (should be aligned to vector size) + * DISCLAIMER: this file is deprecated: use lap.cuh instead */ -DI void lds(uint8_t& x, const uint8_t* addr) -{ - uint32_t x_int; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); - x = x_int; -} -DI void lds(uint8_t (&x)[1], const uint8_t* addr) -{ - uint32_t x_int[1]; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); - x[0] = x_int[0]; -} -DI void lds(uint8_t (&x)[2], const uint8_t* addr) -{ - uint32_t x_int[2]; - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); - x[0] = x_int[0]; - x[1] = x_int[1]; -} -DI void lds(uint8_t (&x)[4], const uint8_t* addr) -{ - uint32_t x_int[4]; - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.u8 {%0, %1, %2, %3}, [%4];" - : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) - : "l"(s4)); - x[0] = x_int[0]; - x[1] = x_int[1]; - x[2] = x_int[2]; - x[3] = x_int[3]; -} - -DI void lds(int8_t& x, const int8_t* addr) -{ - int32_t x_int; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); - x = x_int; -} -DI void lds(int8_t (&x)[1], const int8_t* addr) -{ - int32_t x_int[1]; - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); - x[0] = x_int[0]; -} -DI void lds(int8_t (&x)[2], const int8_t* addr) -{ - int32_t x_int[2]; - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); - x[0] = x_int[0]; - x[1] = x_int[1]; -} -DI void lds(int8_t (&x)[4], const int8_t* addr) -{ - int32_t x_int[4]; - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.s8 {%0, %1, %2, %3}, [%4];" - : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) - : "l"(s4)); - x[0] = x_int[0]; - x[1] = x_int[1]; - x[2] = x_int[2]; - x[3] = x_int[3]; -} - -DI void lds(uint32_t (&x)[4], const uint32_t* addr) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) - : "l"(s4)); -} - -DI void lds(uint32_t (&x)[2], const uint32_t* addr) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); -} - -DI void lds(uint32_t (&x)[1], const uint32_t* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); -} - -DI void lds(uint32_t& x, const uint32_t* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); -} - -DI void lds(int32_t (&x)[4], const int32_t* addr) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) - : "l"(s4)); -} - -DI void lds(int32_t (&x)[2], const int32_t* addr) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm 
volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); -} - -DI void lds(int32_t (&x)[1], const int32_t* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); -} - -DI void lds(int32_t& x, const int32_t* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); -} - -DI void lds(float& x, const float* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); -} -DI void lds(float (&x)[1], const float* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); -} -DI void lds(float (&x)[2], const float* addr) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); -} -DI void lds(float (&x)[4], const float* addr) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" - : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) - : "l"(s4)); -} - -DI void lds(float& x, float* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); -} -DI void lds(float (&x)[1], float* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); -} -DI void lds(float (&x)[2], float* addr) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); -} -DI void lds(float (&x)[4], float* addr) -{ - auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" - : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) - : "l"(s4)); -} -DI void lds(double& x, double* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); -} -DI void lds(double (&x)[1], double* addr) -{ - auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); -} -DI void lds(double (&x)[2], double* addr) -{ - auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); -} -/** @} */ - -/** - * @defgroup GlobalLoads Global cached load operations - * @{ - * @brief Load from global memory with caching at L1 level - * @param[out] x data to be loaded from global memory - * @param[in] addr address in global memory from where to load - */ -DI void ldg(float& x, const float* addr) -{ - asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); -} -DI void ldg(float (&x)[1], const float* addr) -{ - asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); -} -DI void ldg(float (&x)[2], const float* addr) -{ - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); -} -DI void ldg(float (&x)[4], const float* addr) -{ - asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" - : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) - : "l"(addr)); -} -DI void ldg(double& x, const double* addr) -{ - asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); -} -DI void ldg(double 
(&x)[1], const double* addr) -{ - asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); -} -DI void ldg(double (&x)[2], const double* addr) -{ - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); -} - -DI void ldg(uint32_t (&x)[4], const uint32_t* const& addr) -{ - asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) - : "l"(addr)); -} - -DI void ldg(uint32_t (&x)[2], const uint32_t* const& addr) -{ - asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); -} - -DI void ldg(uint32_t (&x)[1], const uint32_t* const& addr) -{ - asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); -} - -DI void ldg(uint32_t& x, const uint32_t* const& addr) -{ - asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); -} - -DI void ldg(int32_t (&x)[4], const int32_t* const& addr) -{ - asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) - : "l"(addr)); -} - -DI void ldg(int32_t (&x)[2], const int32_t* const& addr) -{ - asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); -} - -DI void ldg(int32_t (&x)[1], const int32_t* const& addr) -{ - asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); -} - -DI void ldg(int32_t& x, const int32_t* const& addr) -{ - asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); -} - -DI void ldg(uint8_t (&x)[4], const uint8_t* const& addr) -{ - uint32_t x_int[4]; - asm volatile("ld.global.cg.v4.u8 {%0, %1, %2, %3}, [%4];" - : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) - : "l"(addr)); - x[0] = x_int[0]; - x[1] = x_int[1]; - x[2] = x_int[2]; - x[3] = x_int[3]; -} - -DI void ldg(uint8_t (&x)[2], const uint8_t* const& addr) -{ - uint32_t x_int[2]; - asm volatile("ld.global.cg.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); - x[0] = x_int[0]; - x[1] = x_int[1]; -} - -DI void ldg(uint8_t (&x)[1], const uint8_t* const& addr) -{ - uint32_t x_int; - asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); - x[0] = x_int; -} - -DI void ldg(uint8_t& x, const uint8_t* const& addr) -{ - uint32_t x_int; - asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); - x = x_int; -} - -DI void ldg(int8_t (&x)[4], const int8_t* const& addr) -{ - int x_int[4]; - asm volatile("ld.global.cg.v4.s8 {%0, %1, %2, %3}, [%4];" - : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) - : "l"(addr)); - x[0] = x_int[0]; - x[1] = x_int[1]; - x[2] = x_int[2]; - x[3] = x_int[3]; -} - -DI void ldg(int8_t (&x)[2], const int8_t* const& addr) -{ - int x_int[2]; - asm volatile("ld.global.cg.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); - x[0] = x_int[0]; - x[1] = x_int[1]; -} - -DI void ldg(int8_t& x, const int8_t* const& addr) -{ - int x_int; - asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); - x = x_int; -} - -DI void ldg(int8_t (&x)[1], const int8_t* const& addr) -{ - int x_int; - asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); - x[0] = x_int; -} +#pragma once -/** @} */ +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the raft/util version instead.") -} // namespace raft +#include diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 9735ccdf2b..0e83f9a5cd 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,56 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include -#include - -namespace raft { +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. + * DISCLAIMER: this file is deprecated: use lap.cuh instead */ -template , int TPB = 256> -void scatter(DataT* out, - const DataT* in, - const IdxT* idx, - IdxT len, - cudaStream_t stream, - Lambda op = raft::Nop()) -{ - if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); - constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; - if (16 / MaxPerElem && bytes % 16 == 0) { - detail::scatterImpl(out, in, idx, len, op, stream); - } else if (8 / MaxPerElem && bytes % 8 == 0) { - detail::scatterImpl(out, in, idx, len, op, stream); - } else if (4 / MaxPerElem && bytes % 4 == 0) { - detail::scatterImpl(out, in, idx, len, op, stream); - } else if (2 / MaxPerElem && bytes % 2 == 0) { - detail::scatterImpl(out, in, idx, len, op, stream); - } else if (1 / MaxPerElem) { - detail::scatterImpl(out, in, idx, len, op, stream); - } else { - detail::scatterImpl(out, in, idx, len, op, stream); - } -} -} // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/matrix version instead.") + +#include diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp index e613f1e5c2..633c8dd3e1 100644 --- a/cpp/include/raft/common/seive.hpp +++ b/cpp/include/raft/common/seive.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,113 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once - -#include -#include - -// Taken from: -// https://github.com/teju85/programming/blob/master/euler/include/seive.h - -namespace raft { -namespace common { - /** - * @brief Implementation of 'Seive of Eratosthenes' + * This file is deprecated and will be removed in release 22.06. + * Please use the raft/util version instead. */ -class Seive { - public: - /** - * @param _num number of integers for which seive is needed - */ - Seive(unsigned _num) - { - N = _num; - generateSeive(); - } - - /** - * @brief Check whether a number is prime or not - * @param num number to be checked - * @return true if the 'num' is prime, else false - */ - bool isPrime(unsigned num) const - { - unsigned mask, pos; - if (num <= 1) { return false; } - if (num == 2) { return true; } - if (!(num & 1)) { return false; } - getMaskPos(num, mask, pos); - return (seive[pos] & mask); - } - private: - void generateSeive() - { - auto sqN = fastIntSqrt(N); - auto size = raft::ceildiv(N, sizeof(unsigned) * 8); - seive.resize(size); - // assume all to be primes initially - for (auto& itr : seive) { - itr = 0xffffffffu; - } - unsigned cid = 0; - unsigned cnum = getNum(cid); - while (cnum <= sqN) { - do { - ++cid; - cnum = getNum(cid); - if (isPrime(cnum)) { break; } - } while (cnum <= sqN); - auto cnum2 = cnum << 1; - // 'unmark' all the 'odd' multiples of the current prime - for (unsigned i = 3, num = i * cnum; num <= N; i += 2, num += cnum2) { - unmark(num); - } - } - } - - unsigned getId(unsigned num) const { return (num >> 1); } - - unsigned getNum(unsigned id) const - { - if (id == 0) { return 2; } - return ((id << 1) + 1); - } - - void getMaskPos(unsigned num, unsigned& mask, unsigned& pos) const - { - pos = getId(num); - mask = 1 << (pos & 0x1f); - pos >>= 5; - } +/** + * DISCLAIMER: this file is deprecated: use the raft/util version instead + */ - void unmark(unsigned num) - { - unsigned mask, pos; - getMaskPos(num, mask, pos); - seive[pos] &= ~mask; - } +#pragma once - // REF: http://www.azillionmonkeys.com/qed/ulerysqroot.pdf - unsigned fastIntSqrt(unsigned val) - { - unsigned g = 0; - auto bshft = 15u, b = 1u << bshft; - do { - unsigned temp = ((g << 1) + b) << bshft--; - if (val >= temp) { - g += b; - val -= temp; - } - } while (b >>= 1); - return g; - } +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the raft/util version instead.") - /** find all primes till this number */ - unsigned N; - /** the seive */ - std::vector seive; -}; -}; // namespace common -}; // namespace raft +#include diff --git a/cpp/include/raft/comms/comms_test.hpp b/cpp/include/raft/comms/comms_test.hpp index f01060cb40..c7e5dd3ab6 100644 --- a/cpp/include/raft/comms/comms_test.hpp +++ b/cpp/include/raft/comms/comms_test.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { namespace comms { diff --git a/cpp/include/raft/comms/detail/mpi_comms.hpp b/cpp/include/raft/comms/detail/mpi_comms.hpp index 3bf5438296..508a9ce717 100644 --- a/cpp/include/raft/comms/detail/mpi_comms.hpp +++ b/cpp/include/raft/comms/detail/mpi_comms.hpp @@ -28,9 +28,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include #include diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index 2be1310c50..e64c6d9bf0 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -20,13 +20,13 @@ #include #include -#include +#include #include #include -#include +#include -#include +#include #include diff --git a/cpp/include/raft/comms/detail/test.hpp b/cpp/include/raft/comms/detail/test.hpp index d81d7c80fb..6ba4be3886 100644 --- a/cpp/include/raft/comms/detail/test.hpp +++ b/cpp/include/raft/comms/detail/test.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 79976811ed..668acafae4 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/cpp/include/raft/comms/detail/util.hpp b/cpp/include/raft/comms/detail/util.hpp index ff564603e1..969a8789dd 100644 --- a/cpp/include/raft/comms/detail/util.hpp +++ b/cpp/include/raft/comms/detail/util.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include /** diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index b1aae86556..f6b63ac971 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 7604606ba1..edace60fbd 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp index 7f0aa74960..771f38fee3 100644 --- a/cpp/include/raft/core/comms.hpp +++ b/cpp/include/raft/core/comms.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp index f5de57677d..d2456433ab 100644 --- a/cpp/include/raft/core/cublas_macros.hpp +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once we have logger enabled //#include diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index e0957ea1f3..591f41629d 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -16,484 
+16,8 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. + * Please use util/cudart_utils.hpp instead. */ -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -///@todo: enable once logging has been enabled in raft -//#include "logger.hpp" - -namespace raft { - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define RAFT_CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - */ -#ifndef NDEBUG -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); -#endif - -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. -// */ -#define RAFT_CUDA_TRY_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ - } while (0) - -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. 
- * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - -namespace raft { - -/** Helper method to get to know warp size in device code */ -__host__ __device__ constexpr inline int warp_size() { return 32; } - -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to threads. - */ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximum number of blocks in 1d grid - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to warps. - */ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximum number of blocks in 1d grid - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to blocks. 
- */ -class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximum number of blocks in 1d grid - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief Generic copy method for all kinds of transfers - * @tparam Type data type - * @param dst destination pointer - * @param src source pointer - * @param len lenth of the src/dst buffers in terms of number of elements - * @param stream cuda stream - */ -template -void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); -} - -/** - * @defgroup Copy Copy methods - * These are here along with the generic 'copy' method in order to improve - * code readability using explicitly specified function names - * @{ - */ -/** performs a host to device copy */ -template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(d_ptr, h_ptr, len, stream); -} - -/** performs a device to host copy */ -template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(h_ptr, d_ptr, len, stream); -} - -template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); -} -/** @} */ - -/** - * @defgroup Debug Utils for debugging host/device buffers - * @{ - */ -template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];" << std::endl; -} - -template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ - auto host_mem = std::make_unique(componentsCount); - CUDA_CHECK( - cudaMemcpy(host_mem.get(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem.get(), componentsCount, out); -} - -/** - * @brief Print an array given a device or a host pointer. 
- * - * @param[in] variable_name - * @param[in] ptr any pointer (device/host/managed, etc) - * @param[in] componentsCount array length - * @param out the output stream - */ -template -void print_vector(const char* variable_name, const T* ptr, size_t componentsCount, OutStream& out) -{ - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); - if (attr.hostPointer != nullptr) { - print_host_vector(variable_name, reinterpret_cast(attr.hostPointer), componentsCount, out); - } else if (attr.type == cudaMemoryTypeUnregistered) { - print_host_vector(variable_name, ptr, componentsCount, out); - } else { - print_device_vector(variable_name, ptr, componentsCount, out); - } -} -/** @} */ - -/** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; -} - -/** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; -} - -/** helper method to convert an array on device to a string on host */ -template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ - std::stringstream ss; - - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; - - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; - - free(arr_h); - - return ss.str(); -} - -/** this seems to be unused, but may be useful in the future */ -template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; -} - -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); -} - -/** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } - -/** Helper function to check alignment of pointer. 
- * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ -template -bool is_aligned(Type* ptr, size_t alignment) -{ - return reinterpret_cast(ptr) % alignment == 0; -} - -/** calculate greatest common divisor of two numbers - * @a integer - * @b integer - * @ return gcd of a and b - */ -template -IntType gcd(IntType a, IntType b) -{ - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; -} - -template -constexpr T lower_bound() -{ - if constexpr (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { - return -std::numeric_limits::infinity(); - } - return std::numeric_limits::lowest(); -} - -template -constexpr T upper_bound() -{ - if constexpr (std::numeric_limits::has_infinity) { return std::numeric_limits::infinity(); } - return std::numeric_limits::max(); -} - -/** - * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned - * unique pointer. - * - * This function is useful in the code where multiple repeated allocations/deallocations are - * expected. - * Use case example: - * @code{.cpp} - * void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) { - * auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float)); - * if (pool_guard){ - * RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size()); - * } else { - * RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource"); - * } - * rmm::device_uvector x(n, stream, mr); - * rmm::device_uvector y(n, stream, mr); - * ... - * } - * @endcode - * Here, the new memory resource would be created within the function scope if the passed `mr` is - * null and the default resource is not a pool. After the call, `mr` contains a valid memory - * resource in any case. - * - * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it - * into a `pool_memory_resource` if neccessary and return the pointer to the result. - * @param initial_size if a new memory pool is created, this would be its initial size (rounded up - * to 256 bytes). - * - * @return if a new memory pool is created, it returns a unique_ptr to it; - * this managed pointer controls the lifetime of the created memory resource. 
- */ -inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_t initial_size) -{ - using pool_res_t = rmm::mr::pool_memory_resource; - std::unique_ptr pool_res{}; - if (mr) return pool_res; - mr = rmm::mr::get_current_device_resource(); - if (!dynamic_cast(mr) && - !dynamic_cast*>(mr) && - !dynamic_cast*>(mr)) { - pool_res = std::make_unique(mr, (initial_size + 255) & (~255)); - mr = pool_res.get(); - } - return pool_res; -} - -} // namespace raft - -#endif +#include diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp index b41927f5fb..505485e6a0 100644 --- a/cpp/include/raft/core/cusolver_macros.hpp +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp index 10c7e8836c..cf5195582b 100644 --- a/cpp/include/raft/core/cusparse_macros.hpp +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include ///@todo: enable this once logging is enabled //#include diff --git a/cpp/include/raft/common/detail/callback_sink.hpp b/cpp/include/raft/core/detail/callback_sink.hpp similarity index 100% rename from cpp/include/raft/common/detail/callback_sink.hpp rename to cpp/include/raft/core/detail/callback_sink.hpp diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/core/detail/device_mdarray.hpp similarity index 52% rename from cpp/include/raft/detail/mdarray.hpp rename to cpp/include/raft/core/detail/device_mdarray.hpp index 48094e3ccf..ff7c31000d 100644 --- a/cpp/include/raft/detail/mdarray.hpp +++ b/cpp/include/raft/core/detail/device_mdarray.hpp @@ -21,9 +21,12 @@ * limitations under the License. */ #pragma once -#include -#include -#include // dynamic_extent +#include +#include +#include + +#include +#include // dynamic_extent #include #include @@ -187,153 +190,4 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; -/** - * @brief A container policy for host mdarray. 
- */ -template > -class host_vector_policy { - public: - using element_type = ElementType; - using container_type = std::vector; - using allocator_type = typename container_type::allocator_type; - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; - using reference = element_type&; - using const_reference = element_type const&; - using accessor_policy = std::experimental::default_accessor; - using const_accessor_policy = std::experimental::default_accessor; - - public: - auto create(size_t n) -> container_type { return container_type(n); } - - constexpr host_vector_policy() noexcept(std::is_nothrow_default_constructible_v) = - default; - explicit constexpr host_vector_policy(rmm::cuda_stream_view) noexcept( - std::is_nothrow_default_constructible_v) - : host_vector_policy() - { - } - - [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference - { - return c[n]; - } - [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept - -> const_reference - { - return c[n]; - } - - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } -}; - -/** - * @brief A mixin to distinguish host and device memory. - */ -template -struct accessor_mixin : public AccessorPolicy { - using accessor_type = AccessorPolicy; - using is_host_type = std::conditional_t; - // make sure the explicit ctor can fall through - using AccessorPolicy::AccessorPolicy; - using offset_policy = accessor_mixin; - accessor_mixin(AccessorPolicy const& that) : AccessorPolicy{that} {} // NOLINT -}; - -template -using host_accessor = accessor_mixin; - -template -using device_accessor = accessor_mixin; - -namespace stdex = std::experimental; - -template -using vector_extent = stdex::extents; - -template -using matrix_extent = stdex::extents; - -template -using scalar_extent = stdex::extents; - -template -MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t -{ - int c = 0; - for (; v != 0; v &= v - 1) { - c++; - } - return c; -} - -MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t -{ -#if defined(__CUDA_ARCH__) - return __popc(v); -#elif defined(__GNUC__) || defined(__clang__) - return __builtin_popcount(v); -#else - return native_popc(v); -#endif // compiler -} - -MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t -{ -#if defined(__CUDA_ARCH__) - return __popcll(v); -#elif defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(v); -#else - return native_popc(v); -#endif // compiler -} - -template -MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence) -{ - return std::make_tuple(arr[Idx]...); -} - -template -MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N]) -{ - return arr_to_tup(arr, std::make_index_sequence{}); -} - -// uint division optimization inspired by the CIndexer in cupy. Division operation is -// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 -// bit when the index is smaller, then try to avoid division when it's exp of 2. 
-template -MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents shape) -{ - constexpr auto kRank = static_cast(shape.rank()); - std::size_t index[shape.rank()]{0}; // NOLINT - static_assert(std::is_signed::value, - "Don't change the type without changing the for loop."); - for (int32_t dim = kRank; --dim > 0;) { - auto s = static_cast>>(shape.extent(dim)); - if (s & (s - 1)) { - auto t = idx / s; - index[dim] = idx - t * s; - idx = t; - } else { // exp of 2 - index[dim] = idx & (s - 1); - idx >>= popc(s - 1); - } - } - index[0] = idx; - return arr_to_tup(index); -} - -/** - * Ensure all types listed in the parameter pack `Extents` are integral types. - * Usage: - * put it as the last nameless template parameter of a function: - * `typename = ensure_integral_extents` - */ -template -using ensure_integral_extents = std::enable_if_t...>>; - } // namespace raft::detail diff --git a/cpp/include/raft/core/detail/host_device_accessor.hpp b/cpp/include/raft/core/detail/host_device_accessor.hpp new file mode 100644 index 0000000000..3a71e6366b --- /dev/null +++ b/cpp/include/raft/core/detail/host_device_accessor.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::detail { + +/** + * @brief A mixin to distinguish host and device memory. + */ +template +struct host_device_accessor : public AccessorPolicy { + using accessor_type = AccessorPolicy; + using is_host_type = std::conditional_t; + using is_device_type = std::conditional_t; + using is_managed_type = std::conditional_t; + static constexpr bool is_host_accessible = is_host; + static constexpr bool is_device_accessible = is_device; + static constexpr bool is_managed_accessible = is_device && is_host; + // make sure the explicit ctor can fall through + using AccessorPolicy::AccessorPolicy; + using offset_policy = host_device_accessor; + host_device_accessor(AccessorPolicy const& that) : AccessorPolicy{that} {} // NOLINT +}; + +} // namespace raft::detail diff --git a/cpp/include/raft/core/detail/host_mdarray.hpp b/cpp/include/raft/core/detail/host_mdarray.hpp new file mode 100644 index 0000000000..74bd55e78c --- /dev/null +++ b/cpp/include/raft/core/detail/host_mdarray.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (2019) Sandia Corporation + * + * The source code is licensed under the 3-clause BSD license found in the LICENSE file + * thirdparty/LICENSES/mdarray.license + */ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include + +namespace raft::detail { + +/** + * @brief A container policy for host mdarray. + */ +template > +class host_vector_policy { + public: + using element_type = ElementType; + using container_type = std::vector; + using allocator_type = typename container_type::allocator_type; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = element_type&; + using const_reference = element_type const&; + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + public: + auto create(size_t n) -> container_type { return container_type(n); } + + constexpr host_vector_policy() noexcept(std::is_nothrow_default_constructible_v) = + default; + explicit constexpr host_vector_policy(rmm::cuda_stream_view) noexcept( + std::is_nothrow_default_constructible_v) + : host_vector_policy() + { + } + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } +}; +} // namespace raft::detail diff --git a/cpp/include/raft/common/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp similarity index 100% rename from cpp/include/raft/common/detail/logger.hpp rename to cpp/include/raft/core/detail/logger.hpp diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp new file mode 100644 index 0000000000..66b67579fc --- /dev/null +++ b/cpp/include/raft/core/detail/macros.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef _RAFT_HAS_CUDA +#if defined(__CUDACC__) +#define _RAFT_HAS_CUDA __CUDACC__ +#endif +#endif + +#ifndef _RAFT_HOST_DEVICE +#if defined(_RAFT_HAS_CUDA) +#define _RAFT_DEVICE __device__ +#define _RAFT_HOST __host__ +#define _RAFT_FORCEINLINE __forceinline__ +#else +#define _RAFT_DEVICE +#define _RAFT_HOST +#define _RAFT_FORCEINLINE inline +#endif +#endif + +#define _RAFT_HOST_DEVICE _RAFT_HOST _RAFT_DEVICE + +#ifndef RAFT_INLINE_FUNCTION +#define RAFT_INLINE_FUNCTION _RAFT_FORCEINLINE _RAFT_HOST_DEVICE +#endif + +/** + * Some macro magic to remove optional parentheses of a macro argument. + * See https://stackoverflow.com/a/62984543 + */ +#ifndef RAFT_DEPAREN_MAGICRAFT_DEPAREN_H1 +#define RAFT_DEPAREN(X) RAFT_DEPAREN_H2(RAFT_DEPAREN_H1 X) +#define RAFT_DEPAREN_H1(...) RAFT_DEPAREN_H1 __VA_ARGS__ +#define RAFT_DEPAREN_H2(...) RAFT_DEPAREN_H3(__VA_ARGS__) +#define RAFT_DEPAREN_H3(...) 
RAFT_DEPAREN_MAGIC##__VA_ARGS__ +#define RAFT_DEPAREN_MAGICRAFT_DEPAREN_H1 +#endif + +#ifndef RAFT_STRINGIFY +#define RAFT_STRINGIFY_DETAIL(...) #__VA_ARGS__ +#define RAFT_STRINGIFY(...) RAFT_STRINGIFY_DETAIL(__VA_ARGS__) +#endif diff --git a/cpp/include/raft/core/detail/mdspan_util.cuh b/cpp/include/raft/core/detail/mdspan_util.cuh new file mode 100644 index 0000000000..6b2c90abcc --- /dev/null +++ b/cpp/include/raft/core/detail/mdspan_util.cuh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +namespace raft::detail { + +template +MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence) +{ + return std::make_tuple(arr[Idx]...); +} + +template +MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N]) +{ + return arr_to_tup(arr, std::make_index_sequence{}); +} + +template +MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t +{ + int c = 0; + for (; v != 0; v &= v - 1) { + c++; + } + return c; +} + +MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t +{ +#if defined(__CUDA_ARCH__) + return __popc(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(v); +#else + return native_popc(v); +#endif // compiler +} + +MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t +{ +#if defined(__CUDA_ARCH__) + return __popcll(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcountll(v); +#else + return native_popc(v); +#endif // compiler +} + +} // end namespace raft::detail \ No newline at end of file diff --git a/cpp/include/raft/common/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp similarity index 100% rename from cpp/include/raft/common/detail/nvtx.hpp rename to cpp/include/raft/core/detail/nvtx.hpp diff --git a/cpp/include/raft/detail/span.hpp b/cpp/include/raft/core/detail/span.hpp similarity index 88% rename from cpp/include/raft/detail/span.hpp rename to cpp/include/raft/core/detail/span.hpp index 555b47dcae..20500d618b 100644 --- a/cpp/include/raft/detail/span.hpp +++ b/cpp/include/raft/core/detail/span.hpp @@ -16,12 +16,11 @@ #pragma once #include // numeric_limits -#include -#include // __host__ __device__ +#include +#include #include namespace raft { -constexpr std::size_t dynamic_extent = std::experimental::dynamic_extent; template class span; @@ -75,10 +74,10 @@ struct is_span_t : public is_span_oracle_t::type> { }; template -__host__ __device__ constexpr auto lexicographical_compare(InputIt1 first1, - InputIt1 last1, - InputIt2 first2, - InputIt2 last2) -> bool +_RAFT_HOST_DEVICE constexpr auto lexicographical_compare(InputIt1 first1, + InputIt1 last1, + InputIt2 first2, + InputIt2 last2) -> bool { Compare comp; for (; first1 != last1 && first2 != last2; ++first1, ++first2) { diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp new file mode 100644 index 0000000000..1c17b5bcb9 --- /dev/null +++ 
b/cpp/include/raft/core/device_mdarray.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { + +/** + * @brief mdarray with device container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ +template > +using device_mdarray = + mdarray>; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using device_scalar = device_mdarray>; + +/** + * @brief Shorthand for 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_vector = device_mdarray, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous device matrix. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_matrix = device_mdarray, LayoutPolicy>; + +/** + * @brief Create a device mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param handle raft::handle_t + * @param exts dimensionality of the array (series of integers) + * @return raft::device_mdarray + */ +template +auto make_device_mdarray(const raft::handle_t& handle, extents exts) +{ + using mdarray_t = device_mdarray; + + typename mdarray_t::mapping_type layout{exts}; + typename mdarray_t::container_policy_type policy{handle.get_stream()}; + + return mdarray_t{layout, policy}; +} + +/** + * @brief Create a device mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param handle raft::handle_t + * @param mr rmm memory resource used for allocating the memory for the array + * @param exts dimensionality of the array (series of integers) + * @return raft::device_mdarray + */ +template +auto make_device_mdarray(const raft::handle_t& handle, + rmm::mr::device_memory_resource* mr, + extents exts) +{ + using mdarray_t = device_mdarray; + + typename mdarray_t::mapping_type layout{exts}; + typename mdarray_t::container_policy_type policy{handle.get_stream(), mr}; + + return mdarray_t{layout, policy}; +} + +/** + * @brief Create a 2-dim c-contiguous device mdarray. 
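For context on the factories this new file introduces, a usage sketch follows. The explicit template arguments, the `raft::handle_t` coming from the caller, and the RMM pool setup are illustrative assumptions; only the two `make_device_mdarray` overloads come from the hunk above.

```cpp
#include <raft/core/device_mdarray.hpp>

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void example(raft::handle_t const& handle)
{
  // 2-d device array using the default device memory resource
  auto a = raft::make_device_mdarray<float>(handle, raft::make_extents<int>(128, 64));

  // Same shape, but allocated from a caller-provided RMM pool resource
  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool{
    rmm::mr::get_current_device_resource()};
  auto b = raft::make_device_mdarray<float>(handle, &pool, raft::make_extents<int>(128, 64));

  // Both own their memory; .view() yields a non-owning device mdspan
  auto b_view = b.view();
  static_cast<void>(a);
  static_cast<void>(b_view);
}
```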
+ *
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] handle raft handle for managing expensive resources
+ * @param[in] n_rows number of rows in matrix
+ * @param[in] n_cols number of columns in matrix
+ * @return raft::device_matrix
+ */
+template
+auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexType n_cols)
+{
+  return make_device_mdarray(
+    handle.get_stream(), make_extents(n_rows, n_cols));
+}
+
+/**
+ * @brief Create a device scalar from v.
+ *
+ * @tparam ElementType the data type of the scalar element
+ * @tparam IndexType the index type of the extents
+ * @param[in] handle raft handle for managing expensive cuda resources
+ * @param[in] v scalar to wrap on device
+ * @return raft::device_scalar
+ */
+template
+auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
+{
+  scalar_extent extents;
+  using policy_t = typename device_scalar::container_policy_type;
+  policy_t policy{handle.get_stream()};
+  auto scalar = device_scalar{extents, policy};
+  scalar(0) = v;
+  return scalar;
+}
+
+/**
+ * @brief Create a 1-dim device mdarray.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] handle raft handle for managing expensive cuda resources
+ * @param[in] n number of elements in vector
+ * @return raft::device_vector
+ */
+template
+auto make_device_vector(raft::handle_t const& handle, IndexType n)
+{
+  return make_device_mdarray(handle.get_stream(),
+                             make_extents(n));
+}
+
+} // end namespace raft
diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp
new file mode 100644
index 0000000000..2fc43e2a05
--- /dev/null
+++ b/cpp/include/raft/core/device_mdspan.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace raft {
+
+template
+using device_accessor = detail::host_device_accessor;
+
+template
+using managed_accessor = detail::host_device_accessor;
+
+/**
+ * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location.
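A companion sketch for the scalar and vector factories just defined; the template arguments are spelled out explicitly here because the diff extraction dropped the parameter lists, so take the exact argument order as an assumption:

```cpp
#include <raft/core/device_mdarray.hpp>

// Assumed: a valid raft::handle_t from the caller.
void example(raft::handle_t const& handle)
{
  auto one = raft::make_device_scalar<double, int>(handle, 1.0);  // 0-dim, single device value
  auto v   = raft::make_device_vector<float, int>(handle, 32);    // 1-dim, 32 elements

  auto v_view = v.view();  // non-owning device_vector_view over the same memory
  static_cast<void>(one);
  static_cast<void>(v_view);
}
```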
+ */ +template > +using device_mdspan = mdspan>; + +template > +using managed_mdspan = mdspan>; + +namespace detail { +template +struct is_device_mdspan : std::false_type { +}; +template +struct is_device_mdspan : std::bool_constant { +}; + +/** + * @\brief Boolean to determine if template type T is either raft::device_mdspan or a derived type + */ +template +using is_device_mdspan_t = is_device_mdspan>; + +template +using is_input_device_mdspan_t = is_device_mdspan>; + +template +using is_output_device_mdspan_t = is_device_mdspan>; + +template +struct is_managed_mdspan : std::false_type { +}; +template +struct is_managed_mdspan : std::bool_constant { +}; + +/** + * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type + */ +template +using is_managed_mdspan_t = is_managed_mdspan>; + +template +using is_input_managed_mdspan_t = is_managed_mdspan>; + +template +using is_output_managed_mdspan_t = is_managed_mdspan>; + +} // end namespace detail + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a + * derived type + */ +template +inline constexpr bool is_device_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_device_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_device_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_device_mdspan = std::enable_if_t>; + +template +using enable_if_input_device_mdspan = std::enable_if_t>; + +template +using enable_if_output_device_mdspan = std::enable_if_t>; + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::managed_mdspan or a + * derived type + */ +template +inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_managed_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_managed_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_managed_mdspan = std::enable_if_t>; + +template +using enable_if_input_managed_mdspan = std::enable_if_t>; + +template +using enable_if_output_managed_mdspan = std::enable_if_t>; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using device_scalar_view = device_mdspan>; + +/** + * @brief Shorthand for 1-dim device mdspan. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_vector_view = device_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous device matrix view. 
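The trait machinery above is what lets an API reject host memory at compile time rather than at runtime. A sketch of constraining a hypothetical function with these helpers:

```cpp
#include <raft/core/device_mdspan.hpp>

// A hypothetical algorithm entry point restricted to device mdspans.
template <typename mdspan_t, typename = raft::enable_if_device_mdspan<mdspan_t>>
void device_only_op(mdspan_t m)
{
  // Redundant with the enable_if, but shows the underlying trait directly.
  static_assert(raft::is_device_mdspan_v<mdspan_t>, "device mdspan expected");
  static_cast<void>(m);
}
```

Passing a `host_mdspan` (or any non-mdspan type) removes `device_only_op` from overload resolution entirely.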
+ * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_matrix_view = device_mdspan, LayoutPolicy>; + +/** + * @brief Create a raft::managed_mdspan + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param ptr Pointer to the data + * @param exts dimensionality of the array (series of integers) + * @return raft::managed_mdspan + */ +template +auto make_managed_mdspan(ElementType* ptr, extents exts) +{ + return make_mdspan(ptr, exts); +} + +/** + * @brief Create a 0-dim (scalar) mdspan instance for device value. + * + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + */ +template +auto make_device_scalar_view(ElementType* ptr) +{ + scalar_extent extents; + return device_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + matrix_extent extents{n_rows, n_cols}; + return device_matrix_view{ptr, extents}; +} + +/** + * @brief Create a 1-dim mdspan instance for device pointer. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] n number of elements in pointer + * @return raft::device_vector_view + */ +template +auto make_device_vector_view(ElementType* ptr, IndexType n) +{ + return device_vector_view{ptr, n}; +} + +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_span.hpp b/cpp/include/raft/core/device_span.hpp new file mode 100644 index 0000000000..0730b20bfb --- /dev/null +++ b/cpp/include/raft/core/device_span.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { + +/** + * @brief A span class for device pointer. 
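The view factories defined above wrap memory the caller already owns. A sketch, assuming `d_ptr` points to a device allocation of at least 128 * 64 floats and that the explicit template arguments match the (stripped) parameter lists:

```cpp
#include <raft/core/device_mdspan.hpp>

void example(float* d_ptr)
{
  // Row-major 2-d view and a 1-d view over the same device memory; no copies.
  auto mat = raft::make_device_matrix_view<float, int>(d_ptr, 128, 64);
  auto vec = raft::make_device_vector_view<float, int>(d_ptr, 128);

  static_assert(raft::is_device_mdspan_v<decltype(mat)>);
  static_cast<void>(vec);
}
```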
+ */ +template +using device_span = span; + +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp new file mode 100644 index 0000000000..6221ca59f0 --- /dev/null +++ b/cpp/include/raft/core/host_mdarray.hpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace raft { +/** + * @brief mdarray with host container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ +template > +using host_mdarray = mdarray>; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using host_scalar = host_mdarray>; + +/** + * @brief Shorthand for 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using host_vector = host_mdarray, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous host matrix. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using host_matrix = host_mdarray, LayoutPolicy>; + +/** + * @brief Create a host mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param exts dimensionality of the array (series of integers) + * @return raft::host_mdarray + */ +template +auto make_host_mdarray(extents exts) +{ + using mdarray_t = host_mdarray; + + typename mdarray_t::mapping_type layout{exts}; + typename mdarray_t::container_policy_type policy; + + return mdarray_t{layout, policy}; +} + +/** + * @brief Create a 2-dim c-contiguous host mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n_rows number or rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::host_matrix + */ +template +auto make_host_matrix(IndexType n_rows, IndexType n_cols) +{ + return make_host_mdarray( + make_extents(n_rows, n_cols)); +} + +/** + * @brief Create a host scalar from v. 
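Since `host_mdarray` is backed by `std::vector` (via the `host_vector_policy` shown earlier in this diff), its elements are directly addressable from host code. A sketch of the matrix factory above, with illustrative explicit template arguments:

```cpp
#include <raft/core/host_mdarray.hpp>

void example()
{
  auto m = raft::make_host_matrix<float, int>(7, 6);

  // Row-major fill through the mdarray call operator.
  for (int i = 0; i < m.extent(0); ++i) {
    for (int j = 0; j < m.extent(1); ++j) {
      m(i, j) = static_cast<float>(i * m.extent(1) + j);
    }
  }

  auto m_view = m.view();  // non-owning raft::host_matrix_view
  static_cast<void>(m_view);
}
```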
+ * + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + * @param[in] v scalar type to wrap + * @return raft::host_scalar + */ +template +auto make_host_scalar(ElementType const& v) +{ + // FIXME(jiamingy): We can optimize this by using std::array as container policy, which + // requires some more compile time dispatching. This is enabled in the ref impl but + // hasn't been ported here yet. + scalar_extent extents; + using policy_t = typename host_scalar::container_policy_type; + policy_t policy; + auto scalar = host_scalar{extents, policy}; + scalar(0) = v; + return scalar; +} + +/** + * @brief Create a 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n number of elements in vector + * @return raft::host_vector + */ +template +auto make_host_vector(IndexType n) +{ + return make_host_mdarray(make_extents(n)); +} + +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp new file mode 100644 index 0000000000..fc2a9bbd6d --- /dev/null +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace raft { + +template +using host_accessor = detail::host_device_accessor; + +/** + * @brief std::experimental::mdspan with host tag to avoid accessing incorrect memory location. + */ +template > +using host_mdspan = mdspan>; + +namespace detail { + +template +struct is_host_mdspan : std::false_type { +}; +template +struct is_host_mdspan : std::bool_constant { +}; + +/** + * @\brief Boolean to determine if template type T is either raft::host_mdspan or a derived type + */ +template +using is_host_mdspan_t = is_host_mdspan>; + +template +using is_input_host_mdspan_t = is_host_mdspan>; + +template +using is_output_host_mdspan_t = is_host_mdspan>; + +} // namespace detail + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::host_mdspan or a + * derived type + */ +template +inline constexpr bool is_host_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_host_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_host_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_host_mdspan = std::enable_if_t>; + +template +using enable_if_input_host_mdspan = std::enable_if_t>; + +template +using enable_if_output_host_mdspan = std::enable_if_t>; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using host_scalar_view = host_mdspan>; + +/** + * @brief Shorthand for 1-dim host mdspan. 
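And the 0-dim and 1-dim host factories just defined, again with the template arguments written out explicitly as an assumption:

```cpp
#include <raft/core/host_mdarray.hpp>

void example()
{
  auto s = raft::make_host_scalar<double, int>(42.0);  // 0-dim; s(0) reads the value back
  auto v = raft::make_host_vector<int, int>(16);       // 1-dim, 16 elements

  v(0) = static_cast<int>(s(0));  // element access via the call operator
}
```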
+ * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + */ +template +using host_vector_view = host_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous host matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using host_matrix_view = host_mdspan, LayoutPolicy>; + +/** + * @brief Create a 0-dim (scalar) mdspan instance for host value. + * + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + */ +template +auto make_host_scalar_view(ElementType* ptr) +{ + scalar_extent extents; + return host_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on host to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto make_host_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + matrix_extent extents{n_rows, n_cols}; + return host_matrix_view{ptr, extents}; +} + +/** + * @brief Create a 1-dim mdspan instance for host pointer. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @param[in] ptr on host to wrap + * @param[in] n number of elements in pointer + * @return raft::host_vector_view + */ +template +auto make_host_vector_view(ElementType* ptr, IndexType n) +{ + return host_vector_view{ptr, n}; +} +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/host_span.hpp b/cpp/include/raft/core/host_span.hpp new file mode 100644 index 0000000000..3cad62b7cd --- /dev/null +++ b/cpp/include/raft/core/host_span.hpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +/** + * @brief A span class for host pointer. 
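`host_span` and `device_span` round out the non-owning types. This sketch assumes the pointer-and-count constructor mirrors `std::span`; that constructor is not shown in this diff:

```cpp
#include <raft/core/host_span.hpp>

#include <vector>

void example()
{
  std::vector<float> data(100, 1.0f);
  raft::host_span<float> s{data.data(), data.size()};  // non-owning view of host memory

  float sum = 0.0f;
  for (auto x : s) { sum += x; }  // iterable like std::span
  static_cast<void>(sum);
}
```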
+ */ +template +using host_span = span; + +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp index 55d272739f..76fb7aa7c3 100644 --- a/cpp/include/raft/core/interruptible.hpp +++ b/cpp/include/raft/core/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp index 22e4dd7a90..44c8263abf 100644 --- a/cpp/include/raft/core/logger.hpp +++ b/cpp/include/raft/core/logger.hpp @@ -31,8 +31,8 @@ #include #define SPDLOG_HEADER_ONLY -#include -#include +#include +#include #include // NOLINT #include // NOLINT diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index f1e735c4ab..44730d901e 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -24,206 +24,13 @@ #include -#include +#include +#include #include -#include - +#include #include -#include namespace raft { -/** - * @brief Dimensions extents for raft::host_mdspan or raft::device_mdspan - */ -template -using extents = std::experimental::extents; - -/** - * @defgroup C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. - * @{ - */ -using detail::stdex::layout_right; -using layout_c_contiguous = layout_right; -using row_major = layout_right; -/** @} */ - -/** - * @defgroup F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. - * @{ - */ -using detail::stdex::layout_left; -using layout_f_contiguous = layout_left; -using col_major = layout_left; -/** @} */ - -/** - * @brief Strided layout for non-contiguous memory. - */ -using detail::stdex::layout_stride; - -/** - * @defgroup Common mdarray/mdspan extent types. The rank is known at compile time, each dimension - * is known at run time (dynamic_extent in each dimension). - * @{ - */ -using detail::matrix_extent; -using detail::scalar_extent; -using detail::vector_extent; - -template -using extent_1d = vector_extent; - -template -using extent_2d = matrix_extent; - -template -using extent_3d = detail::stdex::extents; - -template -using extent_4d = - detail::stdex::extents; - -template -using extent_5d = detail::stdex::extents; -/** @} */ - -template > -using mdspan = detail::stdex::mdspan; - -namespace detail { -/** - * @\brief Template checks and helpers to determine if type T is an std::mdspan - * or a derived type - */ - -template -void __takes_an_mdspan_ptr(mdspan*); - -template -struct is_mdspan : std::false_type { -}; -template -struct is_mdspan()))>> - : std::true_type { -}; - -template -using is_mdspan_t = is_mdspan>; - -template -inline constexpr bool is_mdspan_v = is_mdspan_t::value; -} // namespace detail - -template -struct is_mdspan : std::true_type { -}; -template -struct is_mdspan : detail::is_mdspan_t { -}; -template -struct is_mdspan - : std::conditional_t, is_mdspan, std::false_type> { -}; - -/** - * @\brief Boolean to determine if variadic template types Tn are either - * raft::host_mdspan/raft::device_mdspan or their derived types - */ -template -inline constexpr bool is_mdspan_v = is_mdspan::value; - -/** - * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. - */ -template > -using device_mdspan = - mdspan>; - -/** - * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. 
- */ -template > -using host_mdspan = - mdspan>; - -namespace detail { -template -struct is_device_mdspan : std::false_type { -}; -template -struct is_device_mdspan : std::bool_constant { -}; - -/** - * @\brief Boolean to determine if template type T is either raft::device_mdspan or a derived type - */ -template -inline constexpr bool is_device_mdspan_v = is_device_mdspan>::value; - -template -struct is_host_mdspan : std::false_type { -}; -template -struct is_host_mdspan : T::accessor_type::is_host_type { -}; - -/** - * @\brief Boolean to determine if template type T is either raft::host_mdspan or a derived type - */ -template -inline constexpr bool is_host_mdspan_v = is_host_mdspan>::value; -} // namespace detail - -template -struct is_device_mdspan : std::true_type { -}; -template -struct is_device_mdspan : detail::is_device_mdspan> { -}; -template -struct is_device_mdspan - : std::conditional_t, is_device_mdspan, std::false_type> { -}; - -/** - * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a - * derived type - */ -template -inline constexpr bool is_device_mdspan_v = is_device_mdspan::value; - -template -struct is_host_mdspan : std::true_type { -}; -template -struct is_host_mdspan : detail::is_host_mdspan> { -}; -template -struct is_host_mdspan - : std::conditional_t, is_host_mdspan, std::false_type> { -}; - -/** - * @\brief Boolean to determine if variadic template types Tn are either raft::host_mdspan or a - * derived type - */ -template -inline constexpr bool is_host_mdspan_v = is_host_mdspan::value; - /** * @brief Interface to implement an owning multi-dimensional array * @@ -295,7 +102,7 @@ inline constexpr bool is_array_interface_v = is_array_interface::value; * are some inconsistencies in between them. We have made some modificiations to fit our * needs, which are listed below. * - * - Layout policy is different, the mdarray in raft uses `stdex::extent` directly just + * - Layout policy is different, the mdarray in raft uses `std::experimental::extent` directly just * like `mdspan`, while the `mdarray` in the reference implementation uses varidic * template. 
* @@ -348,9 +155,12 @@ class mdarray typename container_policy_type::const_accessor_policy, typename container_policy_type::accessor_policy>> using view_type_impl = - std::conditional_t, - device_mdspan>; + mdspan>; public: /** @@ -456,61 +266,61 @@ class mdarray } // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> rank_type + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto rank() noexcept -> rank_type { return extents_type::rank(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> rank_type + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> rank_type { return extents_type::rank_dynamic(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept -> index_type { return extents_type::static_extent(r); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type { return map_.extents(); } /** * @brief the extent of rank r */ - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type { return map_.extents().extent(r); } // mapping - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type { return map_; } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool { return map_.is_unique(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_exhaustive() const noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto is_exhaustive() const noexcept -> bool { return map_.is_exhaustive(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool { return map_.is_strided(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type + [[nodiscard]] RAFT_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type { return map_.stride(r); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool { return mapping_type::is_always_unique(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_exhaustive() noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto is_always_exhaustive() noexcept -> bool { return mapping_type::is_always_exhaustive(); } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() noexcept -> bool + [[nodiscard]] RAFT_INLINE_FUNCTION static constexpr auto is_always_strided() noexcept -> bool { return mapping_type::is_always_strided(); } @@ -525,477 +335,6 @@ class mdarray container_type c_; }; -/** - * @brief mdarray with host container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for 
indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using host_mdarray = - mdarray>; - -/** - * @brief mdarray with device container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using device_mdarray = - mdarray>; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - */ -template -using host_scalar = host_mdarray>; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - */ -template -using device_scalar = device_mdarray>; - -/** - * @brief Shorthand for 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using host_vector = host_mdarray, LayoutPolicy>; - -/** - * @brief Shorthand for 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_vector = device_mdarray, LayoutPolicy>; - -/** - * @brief Shorthand for c-contiguous host matrix. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using host_matrix = host_mdarray, LayoutPolicy>; - -/** - * @brief Shorthand for c-contiguous device matrix. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_matrix = device_mdarray, LayoutPolicy>; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - */ -template -using host_scalar_view = host_mdspan>; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - */ -template -using device_scalar_view = device_mdspan>; - -/** - * @brief Shorthand for 1-dim host mdspan. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - */ -template -using host_vector_view = host_mdspan, LayoutPolicy>; - -/** - * @brief Shorthand for 1-dim device mdspan. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_vector_view = device_mdspan, LayoutPolicy>; - -/** - * @brief Shorthand for c-contiguous host matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using host_matrix_view = host_mdspan, LayoutPolicy>; - -/** - * @brief Shorthand for c-contiguous device matrix view. 
- * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_matrix_view = device_mdspan, LayoutPolicy>; - -/** - * @brief Create a 0-dim (scalar) mdspan instance for host value. - * - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - */ -template -auto make_host_scalar_view(ElementType* ptr) -{ - scalar_extent extents; - return host_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 0-dim (scalar) mdspan instance for device value. - * - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - */ -template -auto make_device_scalar_view(ElementType* ptr) -{ - scalar_extent extents; - return device_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on host to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_host_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) -{ - matrix_extent extents{n_rows, n_cols}; - return host_matrix_view{ptr, extents}; -} -/** - * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) -{ - matrix_extent extents{n_rows, n_cols}; - return device_matrix_view{ptr, extents}; -} - -/** - * @brief Create a 1-dim mdspan instance for host pointer. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @param[in] ptr on host to wrap - * @param[in] n number of elements in pointer - * @return raft::host_vector_view - */ -template -auto make_host_vector_view(ElementType* ptr, IndexType n) -{ - vector_extent extents{n}; - return host_vector_view{ptr, extents}; -} - -/** - * @brief Create a 1-dim mdspan instance for device pointer. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on device to wrap - * @param[in] n number of elements in pointer - * @return raft::device_vector_view - */ -template -auto make_device_vector_view(ElementType* ptr, IndexType n) -{ - vector_extent extents{n}; - return device_vector_view{ptr, extents}; -} - -/** - * @brief Create a host mdarray. 
- * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param exts dimensionality of the array (series of integers) - * @return raft::host_mdarray - */ -template -auto make_host_mdarray(extents exts) -{ - using mdarray_t = host_mdarray; - - typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy; - - return mdarray_t{layout, policy}; -} - -/** - * @brief Create a device mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param handle raft::handle_t - * @param exts dimensionality of the array (series of integers) - * @return raft::device_mdarray - */ -template -auto make_device_mdarray(const raft::handle_t& handle, extents exts) -{ - using mdarray_t = device_mdarray; - - typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy{handle.get_stream()}; - - return mdarray_t{layout, policy}; -} - -/** - * @brief Create a device mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param handle raft::handle_t - * @param mr rmm memory resource used for allocating the memory for the array - * @param exts dimensionality of the array (series of integers) - * @return raft::device_mdarray - */ -template -auto make_device_mdarray(const raft::handle_t& handle, - rmm::mr::device_memory_resource* mr, - extents exts) -{ - using mdarray_t = device_mdarray; - - typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy{handle.get_stream(), mr}; - - return mdarray_t{layout, policy}; -} - -/** - * @brief Create raft::extents to specify dimensionality - * - * @tparam IndexType The type of each dimension of the extents - * @tparam Extents Dimensions (a series of integers) - * @param exts The desired dimensions - * @return raft::extents - */ -template > -auto make_extents(Extents... exts) -{ - return extents{exts...}; -} - -/** - * @brief Create a 2-dim c-contiguous host mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::host_matrix - */ -template -auto make_host_matrix(IndexType n_rows, IndexType n_cols) -{ - return make_host_mdarray( - make_extents(n_rows, n_cols)); -} - -/** - * @brief Create a 2-dim c-contiguous device mdarray. - * - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] handle raft handle for managing expensive resources - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::device_matrix - */ -template -auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexType n_cols) -{ - return make_device_mdarray( - handle.get_stream(), make_extents(n_rows, n_cols)); -} - -/** - * @brief Create a host scalar from v. 
- * - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - * @param[in] v scalar type to wrap - * @return raft::host_scalar - */ -template -auto make_host_scalar(ElementType const& v) -{ - // FIXME(jiamingy): We can optimize this by using std::array as container policy, which - // requires some more compile time dispatching. This is enabled in the ref impl but - // hasn't been ported here yet. - scalar_extent extents; - using policy_t = typename host_scalar::container_policy_type; - policy_t policy; - auto scalar = host_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a device scalar from v. - * - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] v scalar to wrap on device - * @return raft::device_scalar - */ -template -auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) -{ - scalar_extent extents; - using policy_t = typename device_scalar::container_policy_type; - policy_t policy{handle.get_stream()}; - auto scalar = device_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n number of elements in vector - * @return raft::host_vector - */ -template -auto make_host_vector(IndexType n) -{ - return make_host_mdarray(make_extents(n)); -} - -/** - * @brief Create a 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] n number of elements in vector - * @return raft::device_vector - */ -template -auto make_device_vector(raft::handle_t const& handle, IndexType n) -{ - return make_device_mdarray(handle.get_stream(), - make_extents(n)); -} - -/** - * @brief Flatten raft::host_mdspan or raft::device_mdspan into a 1-dim array view - * - * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan - * @param mds raft::host_mdspan or raft::device_mdspan object - * @return raft::host_mdspan or raft::device_mdspan with vector_extent - * depending on AccessoryPolicy - */ -template >* = nullptr> -auto flatten(mdspan_type mds) -{ - RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); - - vector_extent ext{mds.size()}; - - return detail::stdex::mdspan(mds.data_handle(), ext); -} - /** * @brief Flatten object implementing raft::array_interface into a 1-dim array view * @@ -1011,36 +350,6 @@ auto flatten(const array_interface_type& mda) return flatten(mda.view()); } -/** - * @brief Reshape raft::host_mdspan or raft::device_mdspan - * - * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan - * @tparam IndexType the index type of the extents - * @tparam Extents raft::extents for dimensions - * @param mds raft::host_mdspan or raft::device_mdspan object - * @param new_shape Desired new shape of the input - * @return raft::host_mdspan or raft::device_mdspan, depending on AccessorPolicy - */ -template >* = nullptr> -auto reshape(mdspan_type mds, extents new_shape) -{ - RAFT_EXPECTS(mds.is_exhaustive(), "Input must be 
contiguous."); - - size_t new_size = 1; - for (size_t i = 0; i < new_shape.rank(); ++i) { - new_size *= new_shape.extent(i); - } - RAFT_EXPECTS(new_size == mds.size(), "Cannot reshape array with size mismatch"); - - return detail::stdex::mdspan(mds.data_handle(), new_shape); -} - /** * @brief Reshape object implementing raft::array_interface * @@ -1061,36 +370,4 @@ auto reshape(const array_interface_type& mda, extents new return reshape(mda.view(), new_shape); } -/** - * \brief Turns linear index into coordinate. Similar to numpy unravel_index. - * - * \code - * auto m = make_host_matrix(7, 6); - * auto m_v = m.view(); - * auto coord = unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); - * std::apply(m_v, coord) = 2; - * \endcode - * - * \param idx The linear index. - * \param shape The shape of the array to use. - * \param layout Must be `layout_c_contiguous` (row-major) in current implementation. - * - * \return A std::tuple that represents the coordinate. - */ -template -MDSPAN_INLINE_FUNCTION auto unravel_index(Idx idx, - extents shape, - LayoutPolicy const& layout) -{ - static_assert(std::is_same_v>, - layout_c_contiguous>, - "Only C layout is supported."); - static_assert(std::is_integral_v, "Index must be integral."); - auto constexpr kIs64 = sizeof(std::remove_cv_t>) == sizeof(uint64_t); - if (kIs64 && static_cast(idx) > std::numeric_limits::max()) { - return detail::unravel_index_impl(static_cast(idx), shape); - } else { - return detail::unravel_index_impl(static_cast(idx), shape); - } -} } // namespace raft diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index 809134e96e..a858633e07 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -1,10 +1,3 @@ -/* - * Copyright (2019) Sandia Corporation - * - * The source code is licensed under the 3-clause BSD license found in the LICENSE file - * thirdparty/LICENSES/mdarray.license - */ - /* * Copyright (c) 2022, NVIDIA CORPORATION. * @@ -22,4 +15,244 @@ */ #pragma once -#include \ No newline at end of file +#include +#include + +#include +#include +#include + +#include + +namespace raft { + +template > +using mdspan = std::experimental::mdspan; + +/** + * Ensure all types listed in the parameter pack `Extents` are integral types. 
+ * Usage: + * put it as the last nameless template parameter of a function: + * `typename = ensure_integral_extents` + */ +template +using ensure_integral_extents = std::enable_if_t...>>; + +/** + * @\brief Template checks and helpers to determine if type T is an std::mdspan + * or a derived type + */ + +template +void __takes_an_mdspan_ptr(mdspan*); + +template +struct is_mdspan : std::false_type { +}; +template +struct is_mdspan()))>> + : std::true_type { +}; + +template +struct is_input_mdspan : std::false_type { +}; +template +struct is_input_mdspan()))>> + : std::bool_constant> { +}; + +template +struct is_output_mdspan : std::false_type { +}; +template +struct is_output_mdspan()))>> + : std::bool_constant> { +}; + +template +using is_mdspan_t = is_mdspan>; + +template +using is_input_mdspan_t = is_input_mdspan; + +template +using is_output_mdspan_t = is_output_mdspan; + +/** + * @\brief Boolean to determine if variadic template types Tn are either + * raft::host_mdspan/raft::device_mdspan or their derived types + */ +template +inline constexpr bool is_mdspan_v = std::conjunction_v...>; + +template +using enable_if_mdspan = std::enable_if_t>; + +template +inline constexpr bool is_input_mdspan_v = std::conjunction_v...>; + +template +using enable_if_input_mdspan = std::enable_if_t>; + +template +inline constexpr bool is_output_mdspan_v = std::conjunction_v...>; + +template +using enable_if_output_mdspan = std::enable_if_t>; + +// uint division optimization inspired by the CIndexer in cupy. Division operation is +// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 +// bit when the index is smaller, then try to avoid division when it's exp of 2. +template +RAFT_INLINE_FUNCTION auto unravel_index_impl( + I idx, std::experimental::extents shape) +{ + constexpr auto kRank = static_cast(shape.rank()); + std::size_t index[shape.rank()]{0}; // NOLINT + static_assert(std::is_signed::value, + "Don't change the type without changing the for loop."); + for (int32_t dim = kRank; --dim > 0;) { + auto s = static_cast>>(shape.extent(dim)); + if (s & (s - 1)) { + auto t = idx / s; + index[dim] = idx - t * s; + idx = t; + } else { // exp of 2 + index[dim] = idx & (s - 1); + idx >>= detail::popc(s - 1); + } + } + index[0] = idx; + return detail::arr_to_tup(index); +} + +/** + * @brief Create a raft::mdspan + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @tparam is_host_accessible whether the data is accessible on host + * @tparam is_device_accessible whether the data is accessible on device + * @param ptr Pointer to the data + * @param exts dimensionality of the array (series of integers) + * @return raft::mdspan + */ +template +auto make_mdspan(ElementType* ptr, extents exts) +{ + using accessor_type = + detail::host_device_accessor, + is_host_accessible, + is_device_accessible>; + + return mdspan{ptr, exts}; +} + +/** + * @brief Create raft::extents to specify dimensionality + * + * @tparam IndexType The type of each dimension of the extents + * @tparam Extents Dimensions (a series of integers) + * @param exts The desired dimensions + * @return raft::extents + */ +template > +auto make_extents(Extents... 
exts) +{ + return extents{exts...}; +} + +/** + * @brief Flatten raft::mdspan into a 1-dim array view + * + * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan + * @param mds raft::host_mdspan or raft::device_mdspan object + * @return raft::host_mdspan or raft::device_mdspan with vector_extent + * depending on AccessoryPolicy + */ +template > +auto flatten(mdspan_type mds) +{ + RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); + + vector_extent ext{mds.size()}; + + return std::experimental::mdspan(mds.data_handle(), ext); +} + +/** + * @brief Reshape raft::host_mdspan or raft::device_mdspan + * + * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan + * @tparam IndexType the index type of the extents + * @tparam Extents raft::extents for dimensions + * @param mds raft::host_mdspan or raft::device_mdspan object + * @param new_shape Desired new shape of the input + * @return raft::host_mdspan or raft::device_mdspan, depending on AccessorPolicy + */ +template > +auto reshape(mdspan_type mds, extents new_shape) +{ + RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); + + size_t new_size = 1; + for (size_t i = 0; i < new_shape.rank(); ++i) { + new_size *= new_shape.extent(i); + } + RAFT_EXPECTS(new_size == mds.size(), "Cannot reshape array with size mismatch"); + + return std::experimental::mdspan(mds.data_handle(), + new_shape); +} + +/** + * \brief Turns linear index into coordinate. Similar to numpy unravel_index. + * + * \code + * auto m = make_host_matrix(7, 6); + * auto m_v = m.view(); + * auto coord = unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); + * std::apply(m_v, coord) = 2; + * \endcode + * + * \param idx The linear index. + * \param shape The shape of the array to use. + * \param layout Must be `layout_c_contiguous` (row-major) in current implementation. + * + * \return A std::tuple that represents the coordinate. + */ +template +RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, + extents shape, + LayoutPolicy const& layout) +{ + static_assert(std::is_same_v>, + layout_c_contiguous>, + "Only C layout is supported."); + static_assert(std::is_integral_v, "Index must be integral."); + auto constexpr kIs64 = sizeof(std::remove_cv_t>) == sizeof(uint64_t); + if (kIs64 && static_cast(idx) > std::numeric_limits::max()) { + return unravel_index_impl(static_cast(idx), shape); + } else { + return unravel_index_impl(static_cast(idx), shape); + } +} +} // namespace raft diff --git a/cpp/include/raft/core/mdspan_types.hpp b/cpp/include/raft/core/mdspan_types.hpp new file mode 100644 index 0000000000..bc2ba314a3 --- /dev/null +++ b/cpp/include/raft/core/mdspan_types.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { + +using std::experimental::dynamic_extent; +using std::experimental::extents; + +/** + * @defgroup C-Contiguous layout for mdarray and mdspan. 
Implies row-major and contiguous memory. + * @{ + */ +using std::experimental::layout_right; +using layout_c_contiguous = layout_right; +using row_major = layout_right; +/** @} */ + +/** + * @defgroup F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. + * @{ + */ +using std::experimental::layout_left; +using layout_f_contiguous = layout_left; +using col_major = layout_left; +/** @} */ + +template +using vector_extent = std::experimental::extents; + +template +using matrix_extent = std::experimental::extents; + +template +using scalar_extent = std::experimental::extents; + +/** + * @brief Strided layout for non-contiguous memory. + */ +using std::experimental::layout_stride; + +template +using extent_1d = vector_extent; + +template +using extent_2d = matrix_extent; + +template +using extent_3d = + std::experimental::extents; + +template +using extent_4d = std::experimental:: + extents; + +template +using extent_5d = std::experimental::extents; + +} // namespace raft diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index eb536b0e01..3dbe1dd511 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include /** * \section Usage diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp index 96950e979e..188d58c896 100644 --- a/cpp/include/raft/core/span.hpp +++ b/cpp/include/raft/core/span.hpp @@ -18,10 +18,16 @@ #include #include // size_t #include // std::byte -#include +#include + +#include +#include + +// TODO (cjnolet): Remove thrust dependencies here so host_span can be used without CUDA Toolkit +// being installed. Reference: https://github.com/rapidsai/raft/issues/812. #include #include -#include // __host__ __device__ +#include // _RAFT_HOST_DEVICE #include #include @@ -108,22 +114,22 @@ class span { constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } - __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator + _RAFT_HOST_DEVICE constexpr auto rbegin() const noexcept -> reverse_iterator { return reverse_iterator{end()}; } - __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator + _RAFT_HOST_DEVICE constexpr auto rend() const noexcept -> reverse_iterator { return reverse_iterator{begin()}; } - __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator + _RAFT_HOST_DEVICE constexpr auto crbegin() const noexcept -> const_reverse_iterator { return const_reverse_iterator{cend()}; } - __host__ __device__ constexpr auto crend() const noexcept -> const_reverse_iterator + _RAFT_HOST_DEVICE constexpr auto crend() const noexcept -> const_reverse_iterator { return const_reverse_iterator{cbegin()}; } @@ -201,18 +207,6 @@ class span { detail::span_storage storage_; }; -/** - * @brief A span class for host pointer. - */ -template -using host_span = span; - -/** - * @brief A span class for device pointer. - */ -template -using device_span = span; - template constexpr auto operator==(span l, span r) -> bool { diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 2f0d417f90..6ce414aceb 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
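The layout aliases introduced above only rename the two canonical `std::mdspan` layouts; the difference is purely in how a coordinate maps to an offset. A small sketch using `std::experimental::mdspan` directly, so no RAFT factories are assumed (the include path and function name are hypothetical):

```cpp
#include <experimental/mdspan>  // assumed vendored mdspan location

namespace stdex = std::experimental;

void layout_demo()
{
  float buf[6] = {0, 1, 2, 3, 4, 5};

  // layout_right / row_major: (i, j) -> i * ncols + j
  stdex::mdspan<float, stdex::extents<int, 2, 3>, stdex::layout_right> r(buf);
  // layout_left / col_major:  (i, j) -> i + j * nrows
  stdex::mdspan<float, stdex::extents<int, 2, 3>, stdex::layout_left> l(buf);

  // Same buffer, different coordinates for the same element:
  // r(1, 0) == buf[3], while l(1, 0) == buf[1].
}
```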
@@ -13,782 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include -#include - -#include - -#ifndef ENABLE_MEMCPY_ASYNC -// enable memcpy_async interface by default for newer GPUs -#if __CUDA_ARCH__ >= 800 -#define ENABLE_MEMCPY_ASYNC 1 -#endif -#else // ENABLE_MEMCPY_ASYNC -// disable memcpy_async for all older GPUs -#if __CUDA_ARCH__ < 800 -#define ENABLE_MEMCPY_ASYNC 0 -#endif -#endif // ENABLE_MEMCPY_ASYNC - -namespace raft { - -/** helper macro for device inlined functions */ -#define DI inline __device__ -#define HDI inline __host__ __device__ -#define HD __host__ __device__ - -/** - * @brief Provide a ceiling division operation ie. ceil(a / b) - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr HDI IntType ceildiv(IntType a, IntType b) -{ - return (a + b - 1) / b; -} - -/** - * @brief Provide an alignment function ie. ceil(a / b) * b - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr HDI IntType alignTo(IntType a, IntType b) -{ - return ceildiv(a, b) * b; -} - -/** - * @brief Provide an alignment function ie. (a / b) * b - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr HDI IntType alignDown(IntType a, IntType b) -{ - return (a / b) * b; -} - -/** - * @brief Check if the input is a power of 2 - * @tparam IntType data type (checked only for integers) - */ -template -constexpr HDI bool isPo2(IntType num) -{ - return (num && !(num & (num - 1))); -} - -/** - * @brief Give logarithm of the number to base-2 - * @tparam IntType data type (checked only for integers) - */ -template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) -{ - return num <= IntType(1) ? 
ret : log2(num >> IntType(1), ++ret); -} - -/** Device function to apply the input lambda across threads in the grid */ -template -DI void forEach(int num, L lambda) -{ - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; - const int numThreads = blockDim.x * gridDim.x; -#pragma unroll - for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { - if (idx < num) lambda(idx, itr); - } -} - -/** number of threads per warp */ -static const int WarpSize = 32; - -/** get the laneId of the current thread */ -DI int laneId() -{ - int id; - asm("mov.s32 %0, %%laneid;" : "=r"(id)); - return id; -} - -/** - * @brief Swap two values - * @tparam T the datatype of the values - * @param a first input - * @param b second input - */ -template -HDI void swapVals(T& a, T& b) -{ - T tmp = a; - a = b; - b = tmp; -} - -/** Device function to have atomic add support for older archs */ -template -DI void myAtomicAdd(Type* address, Type val) -{ - atomicAdd(address, val); -} - -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) -// Ref: -// http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf -template <> -DI void myAtomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = - atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); -} -#endif - -template -DI void myAtomicReduce(T* address, T val, ReduceLambda op); - -template -DI void myAtomicReduce(double* address, double val, ReduceLambda op) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS( - address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); - } while (assumed != old); -} - -template -DI void myAtomicReduce(float* address, float val, ReduceLambda op) -{ - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; - do { - assumed = old; - old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); - } while (assumed != old); -} - -template -DI void myAtomicReduce(int* address, int val, ReduceLambda op) -{ - int old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); - } while (assumed != old); -} - -template -DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) -{ - long long old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); - } while (assumed != old); -} - -template -DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) -{ - unsigned long long old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); - } while (assumed != old); -} - -/** - * @brief Provide atomic min operation. - * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, - * val) - * @param[in] val: new value to compare with old - */ -template -DI T myAtomicMin(T* address, T val); - -/** - * @brief Provide atomic max operation. - * @tparam T: data type for input data (float or double). 
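The `myAtomicReduce` overloads above all rely on the same compare-and-swap retry loop. Here is a standalone sketch of that pattern, specialized for a float maximum; it uses only plain CUDA intrinsics, and the function name is hypothetical:

```cpp
// Spin on atomicCAS until our update wins: reread, recompute, retry.
__device__ float atomic_max_float(float* address, float val)
{
  auto* address_as_uint = reinterpret_cast<unsigned int*>(address);
  unsigned int old      = *address_as_uint, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_uint, assumed,
                        __float_as_uint(fmaxf(val, __uint_as_float(assumed))));
  } while (assumed != old);
  return __uint_as_float(old);
}
```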
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value, - * val) - * @param[in] val: new value to compare with old - */ -template -DI T myAtomicMax(T* address, T val); - -DI float myAtomicMin(float* address, float val) -{ - myAtomicReduce(address, val, fminf); - return *address; -} - -DI float myAtomicMax(float* address, float val) -{ - myAtomicReduce(address, val, fmaxf); - return *address; -} - -DI double myAtomicMin(double* address, double val) -{ - myAtomicReduce(address, val, fmin); - return *address; -} - -DI double myAtomicMax(double* address, double val) -{ - myAtomicReduce(address, val, fmax); - return *address; -} - -/** - * @defgroup Max maximum of two numbers - * @{ - */ -template -HDI T myMax(T x, T y); -template <> -HDI float myMax(float x, float y) -{ - return fmaxf(x, y); -} -template <> -HDI double myMax(double x, double y) -{ - return fmax(x, y); -} -/** @} */ - -/** - * @defgroup Min minimum of two numbers - * @{ - */ -template -HDI T myMin(T x, T y); -template <> -HDI float myMin(float x, float y) -{ - return fminf(x, y); -} -template <> -HDI double myMin(double x, double y) -{ - return fmin(x, y); -} -/** @} */ - -/** - * @brief Provide atomic min operation. - * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, - * val) - * @param[in] val: new value to compare with old - */ -template -DI T myAtomicMin(T* address, T val) -{ - myAtomicReduce(address, val, myMin); - return *address; -} - -/** - * @brief Provide atomic max operation. - * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, - * val) - * @param[in] val: new value to compare with old - */ -template -DI T myAtomicMax(T* address, T val) -{ - myAtomicReduce(address, val, myMax); - return *address; -} - -/** - * Sign function - */ -template -HDI int sgn(const T val) -{ - return (T(0) < val) - (val < T(0)); -} - -/** - * @defgroup Exp Exponential function - * @{ - */ -template -HDI T myExp(T x); -template <> -HDI float myExp(float x) -{ - return expf(x); -} -template <> -HDI double myExp(double x) -{ - return exp(x); -} -/** @} */ - -/** - * @defgroup Cuda infinity values - * @{ - */ -template -inline __device__ T myInf(); -template <> -inline __device__ float myInf() -{ - return CUDART_INF_F; -} -template <> -inline __device__ double myInf() -{ - return CUDART_INF; -} -/** @} */ - -/** - * @defgroup Log Natural logarithm - * @{ - */ -template -HDI T myLog(T x); -template <> -HDI float myLog(float x) -{ - return logf(x); -} -template <> -HDI double myLog(double x) -{ - return log(x); -} -/** @} */ - -/** - * @defgroup Sqrt Square root - * @{ - */ -template -HDI T mySqrt(T x); -template <> -HDI float mySqrt(float x) -{ - return sqrtf(x); -} -template <> -HDI double mySqrt(double x) -{ - return sqrt(x); -} -/** @} */ - -/** - * @defgroup SineCosine Sine and cosine calculation - * @{ - */ -template -DI void mySinCos(T x, T& s, T& c); -template <> -DI void mySinCos(float x, float& s, float& c) -{ - sincosf(x, &s, &c); -} -template <> -DI void mySinCos(double x, double& s, double& c) -{ - sincos(x, &s, &c); -} -/** @} */ - -/** - * @defgroup Sine Sine calculation - * @{ - */ -template -DI T mySin(T x); -template <> -DI float mySin(float x) -{ - return sinf(x); -} -template <> -DI double mySin(double x) -{ - return sin(x); -} -/** @} */ - -/** - * @defgroup Abs Absolute value 
- * @{ - */ -template -DI T myAbs(T x) -{ - return x < 0 ? -x : x; -} -template <> -DI float myAbs(float x) -{ - return fabsf(x); -} -template <> -DI double myAbs(double x) -{ - return fabs(x); -} -/** @} */ - -/** - * @defgroup Pow Power function - * @{ - */ -template -HDI T myPow(T x, T power); -template <> -HDI float myPow(float x, float power) -{ - return powf(x, power); -} -template <> -HDI double myPow(double x, double power) -{ - return pow(x, power); -} -/** @} */ - -/** - * @defgroup myTanh tanh function - * @{ - */ -template -HDI T myTanh(T x); -template <> -HDI float myTanh(float x) -{ - return tanhf(x); -} -template <> -HDI double myTanh(double x) -{ - return tanh(x); -} -/** @} */ - -/** - * @defgroup myATanh arctanh function - * @{ - */ -template -HDI T myATanh(T x); -template <> -HDI float myATanh(float x) -{ - return atanhf(x); -} -template <> -HDI double myATanh(double x) -{ - return atanh(x); -} -/** @} */ - /** - * @defgroup LambdaOps Lambda operations in reduction kernels - * @{ + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -// IdxType mostly to be used for MainLambda in *Reduction kernels -template -struct Nop { - HDI Type operator()(Type in, IdxType i = 0) { return in; } -}; - -template -struct L1Op { - HDI Type operator()(Type in, IdxType i = 0) { return myAbs(in); } -}; - -template -struct L2Op { - HDI Type operator()(Type in, IdxType i = 0) { return in * in; } -}; - -template -struct Sum { - HDI Type operator()(Type a, Type b) { return a + b; } -}; -/** @} */ /** - * @defgroup Sign Obtain sign value - * @brief Obtain sign of x - * @param x input - * @return +1 if x >= 0 and -1 otherwise - * @{ + * DISCLAIMER: this file is deprecated: use lap.cuh instead */ -template -DI T signPrim(T x) -{ - return x < 0 ? -1 : +1; -} -template <> -DI float signPrim(float x) -{ - return signbit(x) == true ? -1.0f : +1.0f; -} -template <> -DI double signPrim(double x) -{ - return signbit(x) == true ? -1.0 : +1.0; -} -/** @} */ -/** - * @defgroup Max maximum of two numbers - * @brief Obtain maximum of two values - * @param x one item - * @param y second item - * @return maximum of two items - * @{ - */ -template -DI T maxPrim(T x, T y) -{ - return x > y ? 
x : y; -} -template <> -DI float maxPrim(float x, float y) -{ - return fmaxf(x, y); -} -template <> -DI double maxPrim(double x, double y) -{ - return fmax(x, y); -} -/** @} */ - -/** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() -{ -#if __CUDA_ARCH__ >= 700 - __syncwarp(); -#endif -} - -/** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) -{ -#if CUDART_VERSION >= 9000 - inFlag = __any_sync(mask, inFlag); -#else - inFlag = __any(inFlag); -#endif - return inFlag; -} - -/** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) -{ -#if CUDART_VERSION >= 9000 - inFlag = __all_sync(mask, inFlag); -#else - inFlag = __all(inFlag); -#endif - return inFlag; -} - -/** - * @brief Shuffle the data inside a warp - * @tparam T the data type (currently assumed to be 4B) - * @param val value to be shuffled - * @param srcLane lane from where to shuffle - * @param width lane width - * @param mask mask of participating threads (Volta+) - * @return the shuffled data - */ -template -DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) -{ -#if CUDART_VERSION >= 9000 - return __shfl_sync(mask, val, srcLane, width); -#else - return __shfl(val, srcLane, width); -#endif -} - -/** - * @brief Shuffle the data inside a warp from lower lane IDs - * @tparam T the data type (currently assumed to be 4B) - * @param val value to be shuffled - * @param delta lower lane ID delta from where to shuffle - * @param width lane width - * @param mask mask of participating threads (Volta+) - * @return the shuffled data - */ -template -DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu) -{ -#if CUDART_VERSION >= 9000 - return __shfl_up_sync(mask, val, delta, width); -#else - return __shfl_up(val, delta, width); -#endif -} - -/** - * @brief Shuffle the data inside a warp - * @tparam T the data type (currently assumed to be 4B) - * @param val value to be shuffled - * @param laneMask mask to be applied in order to perform xor shuffle - * @param width lane width - * @param mask mask of participating threads (Volta+) - * @return the shuffled data - */ -template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) -{ -#if CUDART_VERSION >= 9000 - return __shfl_xor_sync(mask, val, laneMask, width); -#else - return __shfl_xor(val, laneMask, width); -#endif -} - -/** - * @brief Four-way byte dot product-accumulate. - * @tparam T Four-byte integer: int or unsigned int - * @tparam S Either same as T or a 4-byte vector of the same signedness. 
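`shfl_xor` is the building block for the butterfly reductions used throughout these headers (see `warpReduce` below). A minimal sum over a full warp looks like the following sketch; it assumes all 32 lanes are active, and the function name is hypothetical:

```cpp
__device__ float warp_sum(float val)
{
  // XOR-butterfly: after log2(WarpSize) rounds every lane holds the total.
  for (int offset = raft::WarpSize / 2; offset > 0; offset >>= 1) {
    val += raft::shfl_xor(val, offset);
  }
  return val;
}
```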
- * - * @param a - * @param b - * @param c - * @return dot(a, b) + c - */ -template -DI auto dp4a(S a, S b, T c) -> T; - -template <> -DI auto dp4a(char4 a, char4 b, int c) -> int -{ -#if __CUDA_ARCH__ >= 610 - return __dp4a(a, b, c); -#else - c += static_cast(a.x) * static_cast(b.x); - c += static_cast(a.y) * static_cast(b.y); - c += static_cast(a.z) * static_cast(b.z); - c += static_cast(a.w) * static_cast(b.w); - return c; -#endif -} - -template <> -DI auto dp4a(uchar4 a, uchar4 b, unsigned int c) -> unsigned int -{ -#if __CUDA_ARCH__ >= 610 - return __dp4a(a, b, c); -#else - c += static_cast(a.x) * static_cast(b.x); - c += static_cast(a.y) * static_cast(b.y); - c += static_cast(a.z) * static_cast(b.z); - c += static_cast(a.w) * static_cast(b.w); - return c; -#endif -} - -template <> -DI auto dp4a(int a, int b, int c) -> int -{ -#if __CUDA_ARCH__ >= 610 - return __dp4a(a, b, c); -#else - return dp4a(*reinterpret_cast(&a), *reinterpret_cast(&b), c); -#endif -} - -template <> -DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int -{ -#if __CUDA_ARCH__ >= 610 - return __dp4a(a, b, c); -#else - return dp4a(*reinterpret_cast(&a), *reinterpret_cast(&b), c); -#endif -} - -/** - * @brief Warp-level sum reduction - * @param val input value - * @tparam T Value type to be reduced - * @return Reduction result. All lanes will have the valid result. - * @note Why not cub? Because cub doesn't seem to allow working with arbitrary - * number of warps in a block. All threads in the warp must enter this - * function together - * @todo Expand this to support arbitrary reduction ops - */ -template -DI T warpReduce(T val) -{ -#pragma unroll - for (int i = WarpSize / 2; i > 0; i >>= 1) { - T tmp = shfl_xor(val, i); - val += tmp; - } - return val; -} - -/** - * @brief 1-D block-level sum reduction - * @param val input value - * @param smem shared memory region needed for storing intermediate results. It - * must alteast be of size: `sizeof(T) * nWarps` - * @return only the thread0 will contain valid reduced result - * @note Why not cub? Because cub doesn't seem to allow working with arbitrary - * number of warps in a block. All threads in the block must enter this - * function together - * @todo Expand this to support arbitrary reduction ops - */ -template -DI T blockReduce(T val, char* smem) -{ - auto* sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); - if (lid == 0) sTemp[wid] = val; - __syncthreads(); - val = lid < nWarps ? sTemp[lid] : T(0); - return warpReduce(val); -} +#pragma once -/** - * @brief Simple utility function to determine whether user_stream or one of the - * internal streams should be used. - * @param user_stream main user stream - * @param int_streams array of internal streams - * @param n_int_streams number of internal streams - * @param idx the index for which to query the stream - */ -inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t* int_streams, - int n_int_streams, - int idx) -{ - return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; -} +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the raft/util version instead.") -} // namespace raft +#include diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b4549e11c9..591f41629d 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -16,8 +16,8 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use core/cudart_utils.hpp instead. + * Please use util/cudart_utils.hpp instead. */ #pragma once -#include +#include diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index 28f7516688..a8bfc4d778 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,656 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -/** - * @brief overloads for CUDA atomic operations - * @file device_atomics.cuh - * - * Provides the overloads for arithmetic data types, where CUDA atomic operations are, `atomicAdd`, - * `atomicMin`, `atomicMax`, and `atomicCAS`. - * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. - * Also provides `raft::genericAtomicOperation` which performs atomic operation with the given - * binary operator. - */ - -#include -#include - -namespace raft { - -namespace device_atomics { -namespace detail { - -// ------------------------------------------------------------------------------------------------- -// Binary operators - -/* @brief binary `sum` operator */ -struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { - return lhs + rhs; - } -}; - -/* @brief binary `min` operator */ -struct DeviceMin { - template - __device__ T operator()(const T& lhs, const T& rhs) - { - return lhs < rhs ? lhs : rhs; - } -}; - -/* @brief binary `max` operator */ -struct DeviceMax { - template - __device__ T operator()(const T& lhs, const T& rhs) - { - return lhs > rhs ? lhs : rhs; - } -}; - -/* @brief binary `product` operator */ -struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { - return lhs * rhs; - } -}; - -/* @brief binary `and` operator */ -struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { - return (lhs & rhs); - } -}; - -/* @brief binary `or` operator */ -struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { - return (lhs | rhs); - } -}; - -/* @brief binary `xor` operator */ -struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { - return (lhs ^ rhs); - } -}; - -// FIXME: remove this if C++17 is supported. -// `static_assert` requires a string literal at C++14. -#define errmsg_cast "size mismatch." 
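These operator functors are the pluggable half of the design: `genericAtomicOperation`, defined further down in this file, turns any of them into an atomic read-modify-write. A usage sketch (the wrapper function is hypothetical):

```cpp
// Atomically accumulate a double, routed through the CAS-based 8-byte path.
__device__ void accumulate(double* sum, double x)
{
  raft::genericAtomicOperation(sum, x, raft::device_atomics::detail::DeviceSum{});
}
```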
- -template -__forceinline__ __device__ T_output type_reinterpret(T_input value) -{ - static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); - return *(reinterpret_cast(&value)); -} - -// ------------------------------------------------------------------------------------------------- -// the implementation of `genericAtomicOperation` - -template -struct genericAtomicOperationImpl; - -// single byte atomic operation -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { - using T_int = unsigned int; - - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - - T_int old = *address_uint32; - T_int assumed; - - do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = type_reinterpret(op(target_value, update_value)); - T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); - - return T((old >> shift) & 0xff); - } -}; - -// 2 bytes atomic operation -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { - using T_int = unsigned int; - bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); - - T_int old = *address_uint32; - T_int assumed; - - do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); - - return (is_32_align) ? T(old & 0xffff) : T(old >> 16); - ; - } -}; - -// 4 bytes atomic operation -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { - using T_int = unsigned int; - T old_value = *addr; - T assumed{old_value}; - - if constexpr (std::is_same{} && (std::is_same{})) { - if (isnan(update_value)) { return old_value; } - } - - do { - assumed = old_value; - const T new_value = op(old_value, update_value); - - T_int ret = atomicCAS(reinterpret_cast(addr), - type_reinterpret(assumed), - type_reinterpret(new_value)); - old_value = type_reinterpret(ret); - } while (assumed != old_value); - - return old_value; - } -}; - -// 4 bytes fp32 atomic Max operation -template <> -struct genericAtomicOperationImpl { - using T = float; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) - { - if (isnan(update_value)) { return *addr; } - - T old = (update_value >= 0) - ? 
__int_as_float(atomicMax((int*)addr, __float_as_int(update_value)))
-            : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(update_value)));
-
-    return old;
-  }
-};
-
-// 8 bytes atomic operation
-template <typename T, typename Op>
-struct genericAtomicOperationImpl<T, Op, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
-  {
-    using T_int = unsigned long long int;
-    static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-
-    T old_value = *addr;
-    T assumed{old_value};
-
-    do {
-      assumed           = old_value;
-      const T new_value = op(old_value, update_value);
-
-      T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
-                            type_reinterpret<T_int, T>(assumed),
-                            type_reinterpret<T_int, T>(new_value));
-      old_value = type_reinterpret<T, T_int>(ret);
-
-    } while (assumed != old_value);
-
-    return old_value;
-  }
-};
-
-// -------------------------------------------------------------------------------------------------
-// specialized functions for operators
-// `atomicAdd` supports int, unsigned int, unsigned long long int, float, double (long long int is
-// not supported.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int.
-// `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int.
-
-// CUDA natively supports `unsigned long long int` for `atomicAdd`,
-// but doesn't support `long int`.
-// However, since the signed integer is represented as Two's complement,
-// the fundamental arithmetic operations of addition are identical to
-// those for unsigned binary numbers.
-// Then, this computes as `unsigned long long int` with `atomicAdd`.
-// @sa https://en.wikipedia.org/wiki/Two%27s_complement
-template <>
-struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
-  using T = long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
-  {
-    using T_int = unsigned long long int;
-    static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
-    return type_reinterpret<T, T_int>(ret);
-  }
-};
-
-template <>
-struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
-  using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
-  {
-    using T_int = unsigned long long int;
-    static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
-    return type_reinterpret<T, T_int>(ret);
-  }
-};
-
-// CUDA natively supports `unsigned long long int` for `atomicAdd`,
-// but doesn't support `long long int`.
-// However, since the signed integer is represented as Two's complement,
-// the fundamental arithmetic operations of addition are identical to
-// those for unsigned binary numbers.
-// Then, this computes as `unsigned long long int` with `atomicAdd` -// @sa https://en.wikipedia.org/wiki/Two%27s_complement -template <> -struct genericAtomicOperationImpl { - using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -template <> -struct genericAtomicOperationImpl { - using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -template <> -struct genericAtomicOperationImpl { - using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -// ------------------------------------------------------------------------------------------------- -// the implementation of `typesAtomicCASImpl` - -template -struct typesAtomicCASImpl; - -template -struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { - using T_int = unsigned int; - - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - - // the 'target_value' in `old` can be different from `compare` - // because other thread may update the value - // before fetching a value from `address_uint32` in this function - T_int old = *address_uint32; - T_int assumed; - T target_value; - uint8_t u_val = type_reinterpret(update_value); - - do { - assumed = old; - target_value = T((old >> shift) & 0xff); - // have to compare `target_value` and `compare` before calling atomicCAS - // the `target_value` in `old` can be different with `compare` - if (target_value != compare) break; - - T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); - } while 
(assumed != old); - - return target_value; - } -}; - -template -struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { - using T_int = unsigned int; - - bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); - - T_int old = *address_uint32; - T_int assumed; - T target_value; - uint16_t u_val = type_reinterpret(update_value); - - do { - assumed = old; - target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - if (target_value != compare) break; - - T_int new_value = - (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); - old = atomicCAS(address_uint32, assumed, new_value); - } while (assumed != old); - - return target_value; - } -}; - -template -struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { - using T_int = unsigned int; - - T_int ret = atomicCAS(reinterpret_cast(addr), - type_reinterpret(compare), - type_reinterpret(update_value)); - return type_reinterpret(ret); - } -}; - -// 8 bytes atomic operation -template -struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { - using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - - T_int ret = atomicCAS(reinterpret_cast(addr), - type_reinterpret(compare), - type_reinterpret(update_value)); - - return type_reinterpret(ret); - } -}; - -} // namespace detail -} // namespace device_atomics - -/** -------------------------------------------------------------------------* - * @brief compute atomic binary operation - * reads the `old` located at the `address` in global or shared memory, - * computes 'BinaryOp'('old', 'update_value'), - * and stores the result back to memory at the same address. - * These three operations are performed in one atomic transaction. - * - * The supported cudf types for `genericAtomicOperation` are: - * int8_t, int16_t, int32_t, int64_t, float, double - * - * @param[in] address The address of old value in global or shared memory - * @param[in] update_value The value to be computed - * @param[in] op The binary operator used for compute - * - * @returns The old value at `address` - * -------------------------------------------------------------------------**/ -template -typename std::enable_if_t::value, T> __forceinline__ __device__ -genericAtomicOperation(T* address, T const& update_value, BinaryOp op) -{ - auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; - return T(fun(address, update_value, op)); -} - -// specialization for bool types -template -__forceinline__ __device__ bool genericAtomicOperation(bool* address, - bool const& update_value, - BinaryOp op) -{ - using T = bool; - // don't use underlying type to apply operation for bool - auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; - return T(fun(address, update_value, op)); -} - -} // namespace raft - /** - * @brief Overloads for `atomicAdd` - * - * reads the `old` located at the `address` in global or shared memory, computes (old + val), and - * stores the result back to memory at the same address. These three operations are performed in one - * atomic transaction. - * - * The supported types for `atomicAdd` are: integers are floating point numbers. 
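What the 1- and 2-byte `typesAtomicCASImpl` specializations above buy you: sub-word types can participate in lock-free protocols even though hardware CAS starts at 32 bits. A sketch using the `atomicCAS` overload defined just below (the helper name is hypothetical):

```cpp
// Claim a one-byte flag exactly once across all threads; requires <cstdint>.
__device__ bool try_claim(uint8_t* flag)
{
  return atomicCAS(flag, uint8_t{0}, uint8_t{1}) == uint8_t{0};
}
```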
- * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`, `float`, `double`.
- *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be added
- *
- * @returns The old value at `address`
- */
-template <typename T>
-__forceinline__ __device__ T atomicAdd(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{});
-}
-
-/**
- * @brief Overloads for `atomicMin`
- *
- * reads the `old` located at the `address` in global or shared memory, computes the minimum of old
- * and val, and stores the result back to memory at the same address. These three operations are
- * performed in one atomic transaction.
- *
- * The supported types for `atomicMin` are: integers and floating point numbers.
- * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`.
- *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be computed
- *
- * @returns The old value at `address`
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
  */
-template <typename T>
-__forceinline__ __device__ T atomicMin(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{});
-}
 
 /**
- * @brief Overloads for `atomicMax`
- *
- * reads the `old` located at the `address` in global or shared memory, computes the maximum of old
- * and val, and stores the result back to memory at the same address. These three operations are
- * performed in one atomic transaction.
- *
- * The supported types for `atomicMax` are: integers and floating point numbers.
- * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`.
- *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] val The value to be computed
- *
- * @returns The old value at `address`
+ * DISCLAIMER: this file is deprecated: use the raft/util version instead
  */
-template <typename T>
-__forceinline__ __device__ T atomicMax(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{});
-}
-/**
- * @brief Overloads for `atomicCAS`
- *
- * reads the `old` located at the `address` in global or shared memory, computes
- * (`old` == `compare` ? `val` : `old`), and stores the result back to memory at the same address.
- * These three operations are performed in one atomic transaction.
- *
- * The supported types for `atomicCAS` are: integers and floating point numbers.
- * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`, `unsigned short int`.
- *
- * @param[in] address The address of old value in global or shared memory
- * @param[in] compare The value to be compared
- * @param[in] val The value to be computed
- *
- * @returns The old value at `address`
- */
-template <typename T>
-__forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
-{
-  return raft::device_atomics::detail::typesAtomicCASImpl<T, sizeof(T)>()(address, compare, val);
-}
-
-/**
- * @brief Overloads for `atomicAnd`
- *
- * reads the `old` located at the `address` in global or shared memory, computes (old & val), and
- * stores the result back to memory at the same address. These three operations are performed in
- * one atomic transaction.
- *
- * The supported types for `atomicAnd` are: integers.
- * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`.
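The bitwise overloads round out the set; one idiomatic use is concurrent maintenance of a shared bit mask. A minimal sketch (64-bit mask chosen arbitrarily, helper names hypothetical):

```cpp
__device__ void set_bit(unsigned long long* mask, int bit) { atomicOr(mask, 1ull << bit); }

__device__ void clear_bit(unsigned long long* mask, int bit) { atomicAnd(mask, ~(1ull << bit)); }
```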
- * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); -} - -/** - * @brief Overloads for `atomicOr` - * - * reads the `old` located at the `address` in global or shared memory, computes (old | val), and - * stores the result back to memory at the same address. These three operations are performed in - * one atomic transaction. - * - * The supported types for `atomicOr` are: integers. - * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); -} +#pragma once -/** - * @brief Overloads for `atomicXor` - * - * reads the `old` located at the `address` in global or shared memory, computes (old ^ val), and - * stores the result back to memory at the same address. These three operations are performed in - * one atomic transaction. - * - * The supported types for `atomicXor` are: integers. - * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); -} +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/util version instead.") -/** - * @brief: Warp aggregated atomic increment - * - * increments an atomic counter using all active threads in a warp. The return - * value is the original value of the counter plus the rank of the calling - * thread. - * - * The use of atomicIncWarp is a performance optimization. It can reduce the - * amount of atomic memory traffic by a factor of 32. - * - * Adapted from: - * https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ - * - * @tparam T An integral type - * @param[in,out] ctr The address of old value - * - * @return The old value of the counter plus the rank of the calling thread. - */ -template ::value, T>* = nullptr> -__device__ T atomicIncWarp(T* ctr) -{ - namespace cg = cooperative_groups; - auto g = cg::coalesced_threads(); - T warp_res; - if (g.thread_rank() == 0) { warp_res = atomicAdd(ctr, static_cast(g.size())); } - return g.shfl(warp_res, 0) + g.thread_rank(); -} +#include diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh index d89a484109..5e6cf47c7d 100644 --- a/cpp/include/raft/device_utils.cuh +++ b/cpp/include/raft/device_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -13,96 +13,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include 
-#include <utility>  // pair
-
-namespace raft {
-
-// TODO move to raft https://github.com/rapidsai/raft/issues/90
-/** helper method to get the compute capability version numbers */
-inline std::pair<int, int> getDeviceCapability()
-{
-  int devId;
-  RAFT_CUDA_TRY(cudaGetDevice(&devId));
-  int major, minor;
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId));
-  return std::make_pair(major, minor);
-}
-
 /**
- * @brief Batched warp-level sum reduction
- *
- * @tparam T data type
- * @tparam NThreads Number of threads in the warp doing independent reductions
- *
- * @param[in] val input value
- * @return for the first "group" of threads, the reduced value. All
- *         others will contain unusable values!
- *
- * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
- *       number of warps in a block and also doesn't support this kind of
- *       batched reduction operation
- * @note All threads in the warp must enter this function together
- *
- * @todo Expand this to support arbitrary reduction ops
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
  */
-template <int NThreads, typename T>
-DI T batchedWarpReduce(T val)
-{
-#pragma unroll
-  for (int i = NThreads; i < raft::WarpSize; i <<= 1) {
-    val += raft::shfl(val, raft::laneId() + i);
-  }
-  return val;
-}
 
 /**
- * @brief 1-D block-level batched sum reduction
- *
- * @tparam T data type
- * @tparam NThreads Number of threads in the warp doing independent reductions
- *
- * @param val input value
- * @param smem shared memory region needed for storing intermediate results. It
- *             must at least be of size: `sizeof(T) * nWarps * NThreads`
- * @return for the first "group" of threads in the block, the reduced value.
- *         All others will contain unusable values!
- *
- * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
- *       number of warps in a block and also doesn't support this kind of
- *       batched reduction operation
- * @note All threads in the block must enter this function together
- *
- * @todo Expand this to support arbitrary reduction ops
+ * DISCLAIMER: this file is deprecated: use the raft/util version instead
  */
-template <int NThreads, typename T>
-DI T batchedBlockReduce(T val, char* smem)
-{
-  auto* sTemp                  = reinterpret_cast<T*>(smem);
-  constexpr int nGroupsPerWarp = raft::WarpSize / NThreads;
-  static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!");
-  const int nGroups = (blockDim.x + NThreads - 1) / NThreads;
-  const int lid     = raft::laneId();
-  const int lgid    = lid % NThreads;
-  const int gid     = threadIdx.x / NThreads;
-  const auto wrIdx  = (gid / nGroupsPerWarp) * NThreads + lgid;
-  const auto rdIdx  = gid * NThreads + lgid;
-  for (int i = nGroups; i > 0;) {
-    auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp;
-    if (gid < iAligned) {
-      val = batchedWarpReduce<NThreads>(val);
-      if (lid < NThreads) sTemp[wrIdx] = val;
-    }
-    __syncthreads();
-    i /= nGroupsPerWarp;
-    if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); }
-    __syncthreads();
-  }
-  return val;
-}
-}  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." 
\ + " Please use the raft/util version instead.") + +#include diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index c88d5afeab..2b77d280fe 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -15,9 +15,9 @@ */ #pragma once -#include #include #include +#include namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 4782afe46e..fa0c7a48cc 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -30,7 +29,8 @@ #include #include #include -#include +#include +#include #include namespace raft { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 81d02c410c..8aae7d40f4 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include namespace raft { @@ -51,9 +51,15 @@ struct MinAndDistanceReduceOpImpl { } } + DI void operator()(LabelT rid, DataT* out, const KVP& other) + { + if (other.value < *out) { *out = other.value; } + } + + DI void init(DataT* out, DataT maxVal) { *out = maxVal; } DI void init(KVP* out, DataT maxVal) { - out->key = -1; + out->key = 0; out->value = maxVal; } }; @@ -92,14 +98,14 @@ DI void updateReducedVal( const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; - // for now have first lane from each warp update a unique output row. This - // will resolve hang issues with pre-Volta architectures + // Update each output row in order within a warp. This will resolve hang + // issues with pre-Volta architectures #pragma unroll for (int j = 0; j < (raft::WarpSize / P::AccThCols); j++) { - if (lid == 0) { + if (lid == j * P::AccThCols) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rid = gridStrideY + accrowid + j + i * P::AccThRows; + auto rid = gridStrideY + accrowid + i * P::AccThRows; if (rid < m) { auto value = val[i]; while (atomicCAS(mutex + rid, 0, 1) == 1) @@ -111,14 +117,6 @@ DI void updateReducedVal( } } } - if (j < (raft::WarpSize / P::AccThCols) - 1) { -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); - auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; - } - } } } @@ -152,7 +150,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, KVPair val[P::AccRowsPerTh]; #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; + val[i] = {0, maxVal}; } // epilogue operation lambda for final value calculation @@ -210,8 +208,10 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); + // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols, + // but the shfl op applies the modulo internally. 
+ auto tmpkey = raft::shfl(val[i].key, lid + j, P::AccThCols); + auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols); KVPair tmp = {tmpkey, tmpvalue}; val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } @@ -222,7 +222,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, // reset the val array. #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; + val[i] = {0, maxVal}; } }; @@ -261,7 +261,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, template void fusedL2NNImpl(OutT* min, @@ -279,7 +279,8 @@ void fusedL2NNImpl(OutT* min, bool initOutBuffer, cudaStream_t stream) { - typedef typename linalg::Policy4x4::Policy P; + // The kernel policy is determined by fusedL2NN. + typedef Policy P; dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 9d203c0c4f..27e9935358 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -14,11 +14,11 @@ * limitations under the License. */ #pragma once -#include -#include #include #include -#include +#include +#include +#include #include diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 3db1749bb4..6e3f97b45c 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -18,12 +18,12 @@ #pragma once +#include #include -#include -#include +#include #include -#include +#include /** * @defgroup pairwise_distance pairwise distance prims diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp index f75263b00d..f6eb4614f9 100644 --- a/cpp/include/raft/distance/distance_type.hpp +++ b/cpp/include/raft/distance/distance_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,57 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * This file is deprecated and will be removed at some point in a future release. + * Please use `raft/distance/distance_types.hpp` instead. + */ #pragma once -namespace raft { -namespace distance { - -/** enum to tell how to compute distance */ -enum DistanceType : unsigned short { +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use distance_types.hpp instead.") - /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */ - L2Expanded = 0, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtExpanded = 1, - /** cosine distance */ - CosineExpanded = 2, - /** L1 distance */ - L1 = 3, - /** evaluate as dist_ij += (x_ik - y-jk)^2 */ - L2Unexpanded = 4, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtUnexpanded = 5, - /** basic inner product **/ - InnerProduct = 6, - /** Chebyshev (Linf) distance **/ - Linf = 7, - /** Canberra distance **/ - Canberra = 8, - /** Generalized Minkowski distance **/ - LpUnexpanded = 9, - /** Correlation distance **/ - CorrelationExpanded = 10, - /** Jaccard distance **/ - JaccardExpanded = 11, - /** Hellinger distance **/ - HellingerExpanded = 12, - /** Haversine distance **/ - Haversine = 13, - /** Bray-Curtis distance **/ - BrayCurtis = 14, - /** Jensen-Shannon distance**/ - JensenShannon = 15, - /** Hamming distance **/ - HammingUnexpanded = 16, - /** KLDivergence **/ - KLDivergence = 17, - /** RusselRao **/ - RusselRaoExpanded = 18, - /** Dice-Sorensen distance **/ - DiceExpanded = 19, - /** Precomputed (special value) **/ - Precomputed = 100 -}; -}; // namespace distance -}; // end namespace raft +#include \ No newline at end of file diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp new file mode 100644 index 0000000000..f75263b00d --- /dev/null +++ b/cpp/include/raft/distance/distance_types.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+namespace raft {
+namespace distance {
+
+/** enum to tell how to compute distance */
+enum DistanceType : unsigned short {
+
+  /** evaluate as dist_ij = sum(x_ik^2) + sum(y_jk^2) - 2*sum(x_ik * y_jk) */
+  L2Expanded = 0,
+  /** same as above, but inside the epilogue, perform square root operation */
+  L2SqrtExpanded = 1,
+  /** cosine distance */
+  CosineExpanded = 2,
+  /** L1 distance */
+  L1 = 3,
+  /** evaluate as dist_ij += (x_ik - y_jk)^2 */
+  L2Unexpanded = 4,
+  /** same as above, but inside the epilogue, perform square root operation */
+  L2SqrtUnexpanded = 5,
+  /** basic inner product **/
+  InnerProduct = 6,
+  /** Chebyshev (Linf) distance **/
+  Linf = 7,
+  /** Canberra distance **/
+  Canberra = 8,
+  /** Generalized Minkowski distance **/
+  LpUnexpanded = 9,
+  /** Correlation distance **/
+  CorrelationExpanded = 10,
+  /** Jaccard distance **/
+  JaccardExpanded = 11,
+  /** Hellinger distance **/
+  HellingerExpanded = 12,
+  /** Haversine distance **/
+  Haversine = 13,
+  /** Bray-Curtis distance **/
+  BrayCurtis = 14,
+  /** Jensen-Shannon distance **/
+  JensenShannon = 15,
+  /** Hamming distance **/
+  HammingUnexpanded = 16,
+  /** KLDivergence **/
+  KLDivergence = 17,
+  /** RusselRao **/
+  RusselRaoExpanded = 18,
+  /** Dice-Sorensen distance **/
+  DiceExpanded = 19,
+  /** Precomputed (special value) **/
+  Precomputed = 100
+};
+};  // namespace distance
+};  // end namespace raft
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index ac8895c9ce..2915bce360 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -21,10 +21,12 @@
 
 #include 
 #include 
-#include 
+#include 
 #include 
-#include 
+#include 
+#include 
 #include 
+#include 
 
 namespace raft {
 namespace distance {
@@ -99,20 +101,114 @@ void fusedL2NN(OutT* min,
                bool initOutBuffer,
                cudaStream_t stream)
 {
+  // When k is smaller than 32, the Policy4x4 results in redundant calculations
+  // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead
+  // that uses tiles with a smaller value of k.
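  // As a worked illustration of the redundancy (an editorial sketch, not part
  // of this change): with the 4x4 policy each accumulation tile spans 32
  // elements along k, so an input with k = 16 leaves every tile half padded,
  // wasting roughly half of the global loads and FMAs; a skinny policy whose
  // tiles span a smaller k removes that padding:
  //
  //   k = 16, tile k-extent = 32  ->  16 of 32 slots per tile carry data
  //   k = 16, tile k-extent = 16  ->  all 16 slots per tile carry data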
+ bool is_skinny = k < 32; + size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } else { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } } +/** + * @brief Wrapper around fusedL2NN with minimum reduction operators. + * + * fusedL2NN cannot be compiled in the distance library due to the lambda + * operators, so this wrapper covers the most common case (minimum). + * This should be preferred to the more generic API when possible, in order to + * reduce compilation times for users of the shared library. + * + * @tparam DataT data type + * @tparam OutT output type to either store 1-NN indices and their minimum + * distances (e.g. cub::KeyValuePair) or store only the min distances. + * @tparam IdxT indexing arithmetic type + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + * @param[in] workspace temp workspace. Size = sizeof(int)*m. 
(on device) + * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt + * @param[in] initOutBuffer whether to initialize the output buffer before the + * main kernel launch + * @param[in] stream cuda stream + */ +template +void fusedL2NNMinReduce(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ + MinAndDistanceReduceOp redOp; + KVPMinReduce pairRedOp; + + fusedL2NN( + min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); +} + } // namespace distance } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp index 768e33b3a7..74ad0974f4 100644 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ b/cpp/include/raft/distance/fused_l2_nn.hpp @@ -18,105 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __FUSED_L2_NN_H -#define __FUSED_L2_NN_H - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace distance { - -template -using KVPMinReduce = detail::KVPMinReduceImpl; - -template -using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; - -template -using MinReduceOp = detail::MinReduceOpImpl; - /** - * Initialize array using init value from reduction op + * DISCLAIMER: this file is deprecated: use fused_l2_nn.cuh instead */ -template -void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) -{ - detail::initialize(min, m, maxVal, redOp, handle.get_stream()); -} -/** - * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call. - * - * The benefits of such a call are 2-fold: 1) eliminate the need for an - * intermediate buffer to store the output of gemm 2) reduce the memory read - * traffic on this intermediate buffer, otherwise needed during the reduction - * phase for 1-NN. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances or store only the min distances. Accordingly, one - * has to pass an appropriate `ReduceOpT` - * @tparam IdxT indexing arithmetic type - * @tparam ReduceOpT A struct to perform the final needed reduction operation - * and also to initialize the output array elements with the - * appropriate initial value needed for reduction. - * - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - * @param[in] workspace temp workspace. Size = sizeof(int)*m. 
(on device) - * @param[in] redOp reduction operator in the epilogue - * @param[in] pairRedOp reduction operation on key value pairs - * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt - * @param[in] initOutBuffer whether to initialize the output buffer before the - * main kernel launch - * @param[in] stream cuda stream - */ -template -void fusedL2NN(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - size_t bytes = sizeof(DataT) * k; - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } -} +#pragma once -} // namespace distance -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "fused_l2_nn.cuh" diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp index 641968d9f1..04afb73036 100644 --- a/cpp/include/raft/distance/specializations.hpp +++ b/cpp/include/raft/distance/specializations.hpp @@ -18,11 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __DISTANCE_SPECIALIZATIONS_H -#define __DISTANCE_SPECIALIZATIONS_H +/** + * DISCLAIMER: this file is deprecated: use specializations.cuh instead + */ #pragma once -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "specializations.cuh" diff --git a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh new file mode 100644 index 0000000000..f0aa1c27ee --- /dev/null +++ b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft { +namespace distance { +namespace detail { +extern template void +distance( + const float* x, + const float* y, + float* dist, + int m, + int n, + int k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + float metric_arg); + +extern template void +distance( + const double* x, + const double* y, + double* dist, + int m, + int n, + int k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + double metric_arg); + +extern template void +distance( + const float* x, + const float* y, + float* dist, + std::uint32_t m, + std::uint32_t n, + std::uint32_t k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + float metric_arg); + +} // namespace detail +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index 7553f87e39..3b7d08f2aa 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -30,3 +30,5 @@ #include #include #include +#include +//#include diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh new file mode 100644 index 0000000000..deddf65b37 --- /dev/null +++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft { +namespace distance { + +extern template void fusedL2NNMinReduce, int>( + cub::KeyValuePair* min, + const float* x, + const float* y, + const float* xn, + const float* yn, + int m, + int n, + int k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce, int64_t>( + cub::KeyValuePair* min, + const float* x, + const float* y, + const float* xn, + const float* yn, + int64_t m, + int64_t n, + int64_t k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce, int>( + cub::KeyValuePair* min, + const double* x, + const double* y, + const double* xn, + const double* yn, + int m, + int n, + int k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce, int64_t>( + cub::KeyValuePair* min, + const double* x, + const double* y, + const double* xn, + const double* yn, + int64_t m, + int64_t n, + int64_t k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce(float* min, + const float* x, + const float* y, + const float* xn, + const float* yn, + int m, + int n, + int k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce(float* min, + const float* x, + const float* y, + const float* xn, + const float* yn, + int64_t m, + int64_t n, + int64_t k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce(double* min, + const double* x, + const double* y, + const double* xn, + const double* yn, + int m, + int n, + int k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); +extern template void fusedL2NNMinReduce(double* min, + const double* x, + const double* y, + const double* xn, + const double* yn, + int64_t m, + int64_t n, + int64_t k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream); + +} // namespace distance +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index a2ce7598c6..8962c3d713 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -1,6 +1,4 @@ /* - * Copyright 2019 BlazingDB, Inc. - * Copyright 2019 Eyal Rozenberg * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,170 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -/** - * @file Utility code involving integer arithmetic - * - */ - -#include -#include - -namespace raft { -//! Utility functions -/** - * Finds the smallest integer not less than `number_to_round` and modulo `S` is - * zero. This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. - */ -template -inline S round_up_safe(S number_to_round, S modulus) -{ - auto remainder = number_to_round % modulus; - if (remainder == 0) { return number_to_round; } - auto rounded_up = number_to_round - remainder + modulus; - if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); - } - return rounded_up; -} - -/** - * Finds the largest integer not greater than `number_to_round` and modulo `S` is - * zero. 
This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. - */ -template -inline S round_down_safe(S number_to_round, S modulus) -{ - auto remainder = number_to_round % modulus; - auto rounded_down = number_to_round - remainder; - return rounded_down; -} - -/** - * Divides the left-hand-side by the right-hand-side, rounding up - * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. - * - * @param dividend the number to divide - * @param divisor the number by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. - * - * @note sensitive to overflow, i.e. if dividend > std::numeric_limits::max() - divisor, - * the result will be incorrect - */ -template -constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept -{ - return (dividend + divisor - 1) / divisor; -} - -namespace detail { -template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept -{ - // TODO: This could probably be implemented faster - return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); -} - -template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept -{ - auto quotient = dividend / divisor; - auto remainder = dividend % divisor; - return quotient + (remainder != 0); -} - -} // namespace detail - /** - * Divides the left-hand-side by the right-hand-side, rounding up - * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. - * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. - * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -template -constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( - I dividend, I divisor) noexcept -{ - using i_is_a_signed_type = std::integral_constant::value>; - return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); -} - -template -constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( - I val) noexcept -{ - return ((val - 1) & val) == 0; -} /** - * @brief Return the absolute value of a number. - * - * This calls `std::abs()` which performs equivalent: `(value < 0) ? -value : value`. - * - * This was created to prevent compile errors calling `std::abs()` with unsigned integers. - * An example compile error appears as follows: - * @code{.pseudo} - * error: more than one instance of overloaded function "std::abs" matches the argument list: - * function "abs(int)" - * function "std::abs(long)" - * function "std::abs(long long)" - * function "std::abs(double)" - * function "std::abs(float)" - * function "std::abs(long double)" - * argument types are: (uint64_t) - * @endcode - * - * Not all cases could be if-ed out using `std::is_signed::value` and satisfy the compiler. - * - * @param value Numeric value can be either integer or float type. - * @return Absolute value if value type is signed. 
- */
-template 
-std::enable_if_t::value, T> constexpr inline absolute_value(T value)
-{
-  return std::abs(value);
-}
-// Unsigned type just returns itself.
-template 
-std::enable_if_t::value, T> constexpr inline absolute_value(T value)
-{
-  return value;
-}
-
-/**
- * @defgroup Check whether the numeric conversion is narrowing
- *
- * @tparam From source type
- * @tparam To destination type
- * @{
+ * DISCLAIMER: this file is deprecated: use the raft/util version instead
 */
-template 
-struct is_narrowing : std::true_type {
-};
-template 
-struct is_narrowing()})>> : std::false_type {
-};
-/** @} */
+#pragma once
 
-/** Check whether the numeric conversion is narrowing */
-template 
-inline constexpr bool is_narrowing_v = is_narrowing::value;  // NOLINT
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft/util version instead.")
 
-}  // namespace raft
+#include 
diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index 189c26f69f..4f47b426c0 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -13,110 +13,19 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-
-#ifndef __CLASS_LABELS_H
-#define __CLASS_LABELS_H
-
-#pragma once
-
-#include 
-
-namespace raft {
-namespace label {
-
 /**
- * Get unique class labels.
- *
- * The y array is assumed to store class labels. The unique values are selected
- * from this array.
- *
- * @tparam value_t numeric type of the arrays with class labels
- * @param [inout] unique output unique labels
- * @param [in] y device array of labels, size [n]
- * @param [in] n number of labels
- * @param [in] stream cuda stream
- * @returns unique device array of unique labels, unallocated on entry,
- * on exit it has size
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
 */
-template 
-int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream)
-{
-  return detail::getUniquelabels(unique, y, n, stream);
-}
 
 /**
- * Assign one versus rest labels.
- *
- * The output labels will have values +/-1:
- *   y_out = (y == y_unique[idx]) ? +1 : -1;
- *
- * The output type currently is set to value_t, but for SVM in principle we are
- * free to choose other type for y_out (it should represent +/-1, and it is used
- * in floating point arithmetics).
- *
- * @param [in] y device array if input labels, size [n]
- * @param [in] n number of labels
- * @param [in] y_unique device array of unique labels, size [n_classes]
- * @param [in] n_classes number of unique labels
- * @param [out] y_out device array of output labels
- * @param [in] idx index of unique label that should be labeled as 1
- * @param [in] stream cuda stream
- */
-template 
-void getOvrlabels(
-  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
-{
-  detail::getOvrlabels(y, n, y_unique, n_classes, y_out, idx, stream);
-}
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset.
This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param[out] out the output monotonic array - * @param[in] in input label array - * @param[in] N number of elements in the input array - * @param[in] stream cuda stream to use - * @param[in] filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - * @param[in] zero_based force monotonic set to start at 0? + * DISCLAIMER: this file is deprecated: use classlabels.cuh instead */ -template -void make_monotonic( - Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) -{ - detail::make_monotonic(out, in, N, stream, filter_op, zero_based); -} -/** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @param[out] out output label array with labels assigned monotonically - * @param[in] in input label array - * @param[in] N number of elements in the input array - * @param[in] stream cuda stream to use - * @param[in] zero_based force monotonic label set to start at 0? - */ -template -void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) -{ - detail::make_monotonic(out, in, N, stream, zero_based); -} -}; // namespace label -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "classlabels.cuh" diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index a941751d78..0af1c70b91 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -18,9 +18,9 @@ #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index 1f62b3f0d6..f93a97d52b 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ -19,9 +19,9 @@ #include #include -#include -#include #include +#include +#include namespace raft { namespace label { diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp index 2bf2fa830b..7c0c25d038 100644 --- a/cpp/include/raft/label/merge_labels.hpp +++ b/cpp/include/raft/label/merge_labels.hpp @@ -13,59 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#ifndef __MERGE_LABELS_H -#define __MERGE_LABELS_H - -#pragma once - -#include - -namespace raft { -namespace label { +/** + * This file is deprecated and will be removed in release 22.06. 
+ * Please use the cuh version instead.
+ */
 
 /**
- * @brief Merge two labellings in-place, according to a core mask
- *
- * A labelling is a representation of disjoint sets (groups) where points that
- * belong to the same group have the same label. It is assumed that group
- * labels take values between 1 and N. labels relate to points, i.e a label i+1
- * means that you belong to the same group as the point i.
- * The special value MAX_LABEL is used to mark points that are not labelled.
- *
- * The two label arrays A and B induce two sets of groups over points 0..N-1.
- * If a point is labelled i in A and j in B and the mask is true for this
- * point, then i and j are equivalent labels and their groups are merged by
- * relabeling the elements of both groups to have the same label. The new label
- * is the smaller one from the original labels.
- * It is required that if the mask is true for a point, this point is labelled
- * (i.e its label is different than the special value MAX_LABEL).
- *
- * One use case is finding connected components: the two input label arrays can
- * represent the connected components of graphs G_A and G_B, and the output
- * would be the connected components labels of G_A \union G_B.
- *
- * @param[inout] labels_a First input, and output label array (in-place)
- * @param[in] labels_b Second input label array
- * @param[in] mask Core point mask
- * @param[out] R label equivalence map
- * @param[in] m Working flag
- * @param[in] N Number of points in the dataset
- * @param[in] stream CUDA stream
+ * DISCLAIMER: this file is deprecated: use merge_labels.cuh instead
 */
-template 
-void merge_labels(value_idx* labels_a,
-                  const value_idx* labels_b,
-                  const bool* mask,
-                  value_idx* R,
-                  bool* m,
-                  value_idx N,
-                  cudaStream_t stream)
-{
-  detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream);
-}
-};  // namespace label
-};  // namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "merge_labels.cuh"
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index e9a862e45a..ca7d5e96a9 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -1,6 +1,5 @@
 /*
 * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- * Copyright 2020 KETAN DATE & RAKESH NAGI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,283 +12,27 @@
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
- *
- * CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm
- * Authors: Ketan Date and Rakesh Nagi
- *
- * Article reference:
- *      Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms
- *      for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72.
- *
+ */
+/**
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use the raft/solver version instead.
*/ -#ifndef __LAP_H -#define __LAP_H +/** + * DISCLAIMER: this file is deprecated: use lap.cuh instead + */ #pragma once -#include -#include - -#include -#include - -#include "detail/d_structs.h" -#include "detail/lap_functions.cuh" - -namespace raft { -namespace lap { - -template -class LinearAssignmentProblem { - vertex_t size_; - vertex_t batchsize_; - weight_t epsilon_; - - weight_t const* d_costs_; - - Vertices d_vertices_dev; - VertexData d_row_data_dev, d_col_data_dev; - - raft::handle_t const& handle_; - rmm::device_uvector row_covers_v; - rmm::device_uvector col_covers_v; - rmm::device_uvector row_duals_v; - rmm::device_uvector col_duals_v; - rmm::device_uvector col_slacks_v; - rmm::device_uvector row_is_visited_v; - rmm::device_uvector col_is_visited_v; - rmm::device_uvector row_parents_v; - rmm::device_uvector col_parents_v; - rmm::device_uvector row_children_v; - rmm::device_uvector col_children_v; - rmm::device_uvector obj_val_primal_v; - rmm::device_uvector obj_val_dual_v; - - public: - LinearAssignmentProblem(raft::handle_t const& handle, - vertex_t size, - vertex_t batchsize, - weight_t epsilon) - : handle_(handle), - size_(size), - batchsize_(batchsize), - epsilon_(epsilon), - d_costs_(nullptr), - row_covers_v(0, handle_.get_stream()), - col_covers_v(0, handle_.get_stream()), - row_duals_v(0, handle_.get_stream()), - col_duals_v(0, handle_.get_stream()), - col_slacks_v(0, handle_.get_stream()), - row_is_visited_v(0, handle_.get_stream()), - col_is_visited_v(0, handle_.get_stream()), - row_parents_v(0, handle_.get_stream()), - col_parents_v(0, handle_.get_stream()), - row_children_v(0, handle_.get_stream()), - col_children_v(0, handle_.get_stream()), - obj_val_primal_v(0, handle_.get_stream()), - obj_val_dual_v(0, handle_.get_stream()) - { - } - - // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) - { - initializeDevice(); - - d_vertices_dev.row_assignments = d_row_assignment; - d_vertices_dev.col_assignments = d_col_assignment; - - d_costs_ = d_cost_matrix; - - int step = 0; - - while (step != 100) { - switch (step) { - case 0: step = hungarianStep0(); break; - case 1: step = hungarianStep1(); break; - case 2: step = hungarianStep2(); break; - case 3: step = hungarianStep3(); break; - case 4: step = hungarianStep4(); break; - case 5: step = hungarianStep5(); break; - case 6: step = hungarianStep6(); break; - } - } - - d_costs_ = nullptr; - } - - // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const - { - return std::make_pair(row_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) - { - return std::make_pair(col_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - // Function for getting optimal dual objective value for subproblem spId. 
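  // An illustrative aside (not from this changeset): for the assignment LP,
  // strong duality holds at optimality, so the primal and dual objective
  // accessors defined here should agree up to numerical tolerance once
  // solve() has run. A hedged sketch with hypothetical names, where `lap`
  // is a solved LinearAssignmentProblem (needs <cmath> and <cassert>):
  //
  //   float primal = lap.getPrimalObjectiveValue(0);
  //   float dual   = lap.getDualObjectiveValue(0);
  //   assert(std::abs(primal - dual) <= 1e-4f);  // strong-duality sanity check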
- weight_t getDualObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - private: - // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() - { - cudaStream_t stream = handle_.get_stream(); - row_covers_v.resize(batchsize_ * size_, stream); - col_covers_v.resize(batchsize_ * size_, stream); - row_duals_v.resize(batchsize_ * size_, stream); - col_duals_v.resize(batchsize_ * size_, stream); - col_slacks_v.resize(batchsize_ * size_, stream); - row_is_visited_v.resize(batchsize_ * size_, stream); - col_is_visited_v.resize(batchsize_ * size_, stream); - row_parents_v.resize(batchsize_ * size_, stream); - col_parents_v.resize(batchsize_ * size_, stream); - row_children_v.resize(batchsize_ * size_, stream); - col_children_v.resize(batchsize_ * size_, stream); - obj_val_primal_v.resize(batchsize_, stream); - obj_val_dual_v.resize(batchsize_, stream); - - d_vertices_dev.row_covers = row_covers_v.data(); - d_vertices_dev.col_covers = col_covers_v.data(); - - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); - d_vertices_dev.col_slacks = col_slacks_v.data(); - - d_row_data_dev.is_visited = row_is_visited_v.data(); - d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() - { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); - - return 1; - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() - { - detail::computeInitialAssignments( - handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); - - int next = 2; - - while (true) { - if ((next = hungarianStep2()) == 6) break; - - if ((next = hungarianStep3()) == 5) break; - - hungarianStep4(); - } - - return next; - } - - // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() - { - int cover_count = detail::computeRowCovers( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - int next = (cover_count == batchsize_ * size_) ? 6 : 3; - - return next; - } - - // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() - { - int next; - - rmm::device_scalar flag_v(handle_.get_stream()); - - bool h_flag = false; - flag_v.set_value_async(h_flag, handle_.get_stream()); - - detail::executeZeroCover(handle_, - d_costs_, - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - flag_v.data(), - batchsize_, - size_, - epsilon_); - - h_flag = flag_v.value(handle_.get_stream()); - - next = h_flag ? 4 : 5; - - return next; - } - - // Function for augmenting the solution along multiple node-disjoint alternating trees. 
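  // An illustrative aside (not from this changeset): end-to-end use of this
  // class, as read from the public members above, is construct-then-solve.
  // A minimal sketch; the names and sizes are hypothetical, `d_costs` is an
  // n*n device cost matrix and `d_rows`/`d_cols` receive the assignments:
  //
  //   raft::handle_t handle;
  //   raft::lap::LinearAssignmentProblem<int, float> lap(handle, n, 1, 1e-6f);
  //   lap.solve(d_costs, d_rows, d_cols);  // runs the Hungarian steps to 100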
- int hungarianStep4() - { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - detail::augmentationPass( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - return 2; - } - - // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() - { - detail::dualUpdate( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); - - return 3; - } - - // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() - { - detail::calcObjValPrimal(handle_, - obj_val_primal_v.data(), - d_costs_, - d_vertices_dev.row_assignments, - batchsize_, - size_); - - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/solver version instead.") - return 100; - } -}; +#include -} // namespace lap -} // namespace raft +using raft::solver::VertexData; +using raft::solver::Vertices; -#endif \ No newline at end of file +namespace raft::lap { +using raft::solver::LinearAssignmentProblem; +} diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp index a9f205932c..30f2b53e52 100644 --- a/cpp/include/raft/lap/lap.hpp +++ b/cpp/include/raft/lap/lap.hpp @@ -1,6 +1,5 @@ /* * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -12,289 +11,21 @@ * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and - * limitations under the License.+ - * - * CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm - * Authors: Ketan Date and Rakesh Nagi - * - * Article reference: - * Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms - * for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72. - * + * limitations under the License. */ - /** * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ -#ifndef __LAP_H -#define __LAP_H +/** + * DISCLAIMER: this file is deprecated: use lap.cuh instead + */ #pragma once -#include -#include - -#include -#include - -#include "detail/d_structs.h" -#include "detail/lap_functions.cuh" - -namespace raft { -namespace lap { - -template -class LinearAssignmentProblem { - vertex_t size_; - vertex_t batchsize_; - weight_t epsilon_; - - weight_t const* d_costs_; - - Vertices d_vertices_dev; - VertexData d_row_data_dev, d_col_data_dev; - - raft::handle_t const& handle_; - rmm::device_uvector row_covers_v; - rmm::device_uvector col_covers_v; - rmm::device_uvector row_duals_v; - rmm::device_uvector col_duals_v; - rmm::device_uvector col_slacks_v; - rmm::device_uvector row_is_visited_v; - rmm::device_uvector col_is_visited_v; - rmm::device_uvector row_parents_v; - rmm::device_uvector col_parents_v; - rmm::device_uvector row_children_v; - rmm::device_uvector col_children_v; - rmm::device_uvector obj_val_primal_v; - rmm::device_uvector obj_val_dual_v; - - public: - LinearAssignmentProblem(raft::handle_t const& handle, - vertex_t size, - vertex_t batchsize, - weight_t epsilon) - : handle_(handle), - size_(size), - batchsize_(batchsize), - epsilon_(epsilon), - d_costs_(nullptr), - row_covers_v(0, handle_.get_stream()), - col_covers_v(0, handle_.get_stream()), - row_duals_v(0, handle_.get_stream()), - col_duals_v(0, handle_.get_stream()), - col_slacks_v(0, handle_.get_stream()), - row_is_visited_v(0, handle_.get_stream()), - col_is_visited_v(0, handle_.get_stream()), - row_parents_v(0, handle_.get_stream()), - col_parents_v(0, handle_.get_stream()), - row_children_v(0, handle_.get_stream()), - col_children_v(0, handle_.get_stream()), - obj_val_primal_v(0, handle_.get_stream()), - obj_val_dual_v(0, handle_.get_stream()) - { - } - - // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) - { - initializeDevice(); - - d_vertices_dev.row_assignments = d_row_assignment; - d_vertices_dev.col_assignments = d_col_assignment; - - d_costs_ = d_cost_matrix; - - int step = 0; - - while (step != 100) { - switch (step) { - case 0: step = hungarianStep0(); break; - case 1: step = hungarianStep1(); break; - case 2: step = hungarianStep2(); break; - case 3: step = hungarianStep3(); break; - case 4: step = hungarianStep4(); break; - case 5: step = hungarianStep5(); break; - case 6: step = hungarianStep6(); break; - } - } - - d_costs_ = nullptr; - } - - // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const - { - return std::make_pair(row_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) - { - return std::make_pair(col_duals_v.data() + spId * size_, size_); - } - - // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - // Function for getting optimal dual objective value for subproblem spId. 
- weight_t getDualObjectiveValue(int spId) - { - weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); - return result; - } - - private: - // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() - { - cudaStream_t stream = handle_.get_stream(); - row_covers_v.resize(batchsize_ * size_, stream); - col_covers_v.resize(batchsize_ * size_, stream); - row_duals_v.resize(batchsize_ * size_, stream); - col_duals_v.resize(batchsize_ * size_, stream); - col_slacks_v.resize(batchsize_ * size_, stream); - row_is_visited_v.resize(batchsize_ * size_, stream); - col_is_visited_v.resize(batchsize_ * size_, stream); - row_parents_v.resize(batchsize_ * size_, stream); - col_parents_v.resize(batchsize_ * size_, stream); - row_children_v.resize(batchsize_ * size_, stream); - col_children_v.resize(batchsize_ * size_, stream); - obj_val_primal_v.resize(batchsize_, stream); - obj_val_dual_v.resize(batchsize_, stream); - - d_vertices_dev.row_covers = row_covers_v.data(); - d_vertices_dev.col_covers = col_covers_v.data(); - - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); - d_vertices_dev.col_slacks = col_slacks_v.data(); - - d_row_data_dev.is_visited = row_is_visited_v.data(); - d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() - { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); - - return 1; - } - - // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() - { - detail::computeInitialAssignments( - handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); - - int next = 2; - - while (true) { - if ((next = hungarianStep2()) == 6) break; - - if ((next = hungarianStep3()) == 5) break; - - hungarianStep4(); - } - - return next; - } - - // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() - { - int cover_count = detail::computeRowCovers( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - int next = (cover_count == batchsize_ * size_) ? 6 : 3; - - return next; - } - - // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() - { - int next; - - rmm::device_scalar flag_v(handle_.get_stream()); - - bool h_flag = false; - flag_v.set_value_async(h_flag, handle_.get_stream()); - - detail::executeZeroCover(handle_, - d_costs_, - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - flag_v.data(), - batchsize_, - size_, - epsilon_); - - h_flag = flag_v.value(handle_.get_stream()); - - next = h_flag ? 4 : 5; - - return next; - } - - // Function for augmenting the solution along multiple node-disjoint alternating trees. 
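  // An illustrative aside (not from this changeset): read off their return
  // values, the hungarianStep* methods form a small state machine that
  // solve() drives until step 100 terminates it:
  //
  //   0 -> 1             initial row/column reduction
  //   1 -> 2/3/4 loop    initial assignments, iterate until 5 or 6
  //   2 -> 6 if every row is covered, else 3
  //   3 -> 4 if an augmenting path exists, else 5
  //   4 -> 2             augment along node-disjoint alternating trees
  //   5 -> 3             dual update exposes new zero-cost arcs
  //   6 -> 100           compute primal/dual objectives and stop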
- int hungarianStep4() - { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - detail::augmentationPass( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); - - return 2; - } - - // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() - { - detail::dualUpdate( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); - - return 3; - } - - // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() - { - detail::calcObjValPrimal(handle_, - obj_val_primal_v.data(), - d_costs_, - d_vertices_dev.row_assignments, - batchsize_, - size_); - - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); - - return 100; - } -}; - -} // namespace lap -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index e25c9df9ef..9f1d5d4a33 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -25,6 +25,10 @@ #include "detail/add.cuh" +#include +#include +#include + namespace raft { namespace linalg { @@ -46,7 +50,7 @@ using detail::adds_scalar; * @param stream cuda stream where to launch work */ template -void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +void addScalar(OutT* out, const InT* in, const InT scalar, IdxType len, cudaStream_t stream) { detail::addScalar(out, in, scalar, len, stream); } @@ -72,7 +76,9 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and * write result to outDev[i] - * @tparam math_t data-type upon which the math operation will be performed + * @tparam InT input data-type. 
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer * @param inDev the input buffer @@ -80,16 +86,143 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st * @param len number of elements in the input and output buffer * @param stream cuda stream */ -template -void addDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) +template +void addDevScalar( + OutT* outDev, const InT* inDev, const InT* singleScalarDev, IdxType len, cudaStream_t stream) { detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } +/** + * @defgroup add Addition Arithmetic + * @{ + */ + +/** + * @brief Elementwise add operation + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in1 First Input + * @param[in] in2 Second Input + * @param[out] out Output + */ +template , + typename = raft::enable_if_output_device_mdspan> +void add(const raft::handle_t& handle, InType in1, InType in2, OutType out) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + if (out.size() <= std::numeric_limits::max()) { + add(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + add(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** + * @brief Elementwise addition of device scalar to input + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[in] scalar raft::device_scalar_view + * @param[in] out Output + */ +template , + typename = raft::enable_if_output_device_mdspan> +void add_scalar(const raft::handle_t& handle, + InType in, + OutType out, + raft::device_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + addDevScalar(out.data_handle(), + in.data_handle(), + scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + addDevScalar(out.data_handle(), + in.data_handle(), + scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** + * @brief Elementwise addition of host scalar to input + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[in] scalar raft::host_scalar_view + * @param[in] out Output + */ +template 
, + typename = raft::enable_if_output_device_mdspan> +void add_scalar(const raft::handle_t& handle, + const InType in, + OutType out, + raft::host_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + addScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + addScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** @} */ // end of group add + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index a80398fcad..e7f9610892 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -18,78 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __ADD_H -#define __ADD_H - -#pragma once - -#include "detail/add.cuh" - -namespace raft { -namespace linalg { - -using detail::adds_scalar; - -/** - * @brief Elementwise scalar add operation on the input buffer - * - * @tparam InT input data-type. Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - */ -template -void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ - detail::addScalar(out, in, scalar, len, stream); -} - /** - * @brief Elementwise add operation on the input buffers - * @tparam InT input data-type. Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use add.cuh instead */ -template -void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ - detail::add(out, in1, in2, len, stream); -} -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param outDev the output buffer - * @param inDev the input buffer - * @param singleScalarDev pointer to the scalar located in device memory - * @param len number of elements in the input and output buffer - * @param stream cuda stream - */ -template -void addDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ - detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "add.cuh" diff --git a/cpp/include/raft/linalg/axpy.cuh b/cpp/include/raft/linalg/axpy.cuh index 2e23047b5a..96cf4277f4 100644 --- a/cpp/include/raft/linalg/axpy.cuh +++ b/cpp/include/raft/linalg/axpy.cuh @@ -50,6 +50,87 @@ void axpy(const raft::handle_t& handle, detail::axpy(handle, n, alpha, x, incx, y, incy, stream); } +/** + * @defgroup axpy axpy + * @{ + */ + +/** + * @brief axpy function + * It computes the following equation: y = alpha * x + y + * + * @tparam InType Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param [in] handle raft::handle_t + * @param [in] alpha raft::device_scalar_view + * @param [in] x Input vector + * @param [inout] y Output vector + * @param [in] incx stride between consecutive elements of x + * @param [in] incy stride between consecutive elements of y + */ +template , + typename = raft::enable_if_output_device_mdspan> +void axpy(const raft::handle_t& handle, + raft::device_scalar_view alpha, + InType x, + OutType y, + const int incx, + const int incy) +{ + RAFT_EXPECTS(y.size() == x.size(), "Size mismatch between Output and Input"); + + axpy(handle, + y.size(), + alpha.data_handle(), + x.data_handle(), + incx, + y.data_handle(), + incy, + handle.get_stream()); +} + +/** + * @brief axpy function + * It computes the following equation: y = alpha * x + y + * + * @tparam MdspanType Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param [in] handle raft::handle_t + * @param [in] alpha raft::device_scalar_view + * @param [in] x Input vector + * @param [inout] y Output vector + * @param [in] incx stride between consecutive elements of x + * @param [in] incy stride between consecutive elements of y + */ +template , + typename = raft::enable_if_output_device_mdspan> +void axpy(const raft::handle_t& handle, + raft::host_scalar_view alpha, + InType x, + OutType y, + const int incx, + const int incy) +{ + RAFT_EXPECTS(y.size() == x.size(), "Size mismatch between Output and Input"); + + axpy(handle, + y.size(), + alpha.data_handle(), + x.data_handle(), + incx, + y.data_handle(), + incy, + handle.get_stream()); +} + +/** @} */ // end of group axpy + } // namespace raft::linalg #endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/axpy.hpp b/cpp/include/raft/linalg/axpy.hpp index c227ba66c8..8db4c5a6e8 100644 --- a/cpp/include/raft/linalg/axpy.hpp +++ b/cpp/include/raft/linalg/axpy.hpp @@ -18,43 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __AXPY_H -#define __AXPY_H - -#pragma once - -#include "detail/axpy.cuh" - -namespace raft::linalg { - /** - * @brief the wrapper of cublas axpy function - * It computes the following equation: y = alpha * x + y - * - * @tparam T the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] n number of elements in x and y - * @param [in] alpha host or device scalar - * @param [in] x vector of length n - * @param [in] incx stride between consecutive elements of x - * @param [inout] y vector of length n - * @param [in] incy stride between consecutive elements of y - * @param [in] stream + * DISCLAIMER: this file is deprecated: use axpy.cuh instead */ -template -void axpy(const raft::handle_t& handle, - const int n, - const T* alpha, - const T* x, - const int incx, - T* y, - const int incy, - cudaStream_t stream) -{ - detail::axpy(handle, n, alpha, x, incx, y, incy, stream); -} -} // namespace raft::linalg +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "axpy.cuh" diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index a85bf698f7..693ef961c2 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -20,7 +20,10 @@ #include "detail/binary_op.cuh" -#include +#include +#include +#include +#include namespace raft { namespace linalg { @@ -52,6 +55,51 @@ void binaryOp( detail::binaryOp(out, in1, in2, len, op, stream); } +/** + * @defgroup binary_op Element-Wise Binary Operation + * @{ + */ + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType Input Type raft::device_mdspan + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in1 First input + * @param[in] in2 Second input + * @param[out] out Output + * @param[in] op the device-lambda + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template , + typename = raft::enable_if_output_device_mdspan> +void binary_op(const raft::handle_t& handle, InType in1, InType in2, OutType out, Lambda op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } else { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } +} + +/** @} */ // end of group binary_op + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 9983e8ab50..f0a54cb164 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -18,46 +18,14 @@ * Please use the cuh version instead. 
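By the same pattern, the new `axpy` and `binary_op` overloads above take a handle and views instead of raw pointers, a length, and a stream. A hedged sketch under the same factory assumptions as the previous example, additionally assuming `raft::make_host_scalar_view` and nvcc's `--extended-lambda` support for the device lambda:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/linalg/axpy.cuh>
#include <raft/linalg/binary_op.cuh>

void axpy_binary_op_sketch(const raft::handle_t& handle)
{
  int n = 256;
  auto x   = raft::make_device_vector<float, int>(handle, n);
  auto y   = raft::make_device_vector<float, int>(handle, n);
  auto out = raft::make_device_vector<float, int>(handle, n);

  // y = alpha * x + y, with alpha staying on the host (unit strides).
  const float alpha = 0.5f;
  raft::linalg::axpy(handle, raft::make_host_scalar_view<const float>(&alpha),
                     x.view(), y.view(), 1, 1);

  // out[i] = max(x[i], y[i]); the lambda matches the documented
  // `OutType func(const InType& val1, const InType& val2)` shape.
  raft::linalg::binary_op(handle, x.view(), y.view(), out.view(),
                          [] __device__(float a, float b) { return a > b ? a : b; });
}
```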
*/ -#ifndef __BINARY_OP_H -#define __BINARY_OP_H - -#pragma once - -#include "detail/binary_op.cuh" - -#include - -namespace raft { -namespace linalg { - /** - * @brief perform element-wise binary operation on the input arrays - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val1, const InType& val2);` + * DISCLAIMER: this file is deprecated: use binary_op.cuh instead */ -template -void binaryOp( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::binaryOp(out, in1, in2, len, op, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "binary_op.cuh" diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index d8e838a634..f40866b235 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -25,6 +25,7 @@ namespace linalg { /** * @brief Rank 1 update of Cholesky decomposition. + * NOTE: The new mdspan-based API will not be provided for this function. * * This method is useful if an algorithm iteratively builds up matrix A, and * the Cholesky decomposition of A is required at each step. @@ -109,7 +110,7 @@ namespace linalg { * @param L device array for to store the triangular matrix L, and the new * column of A in column major format, size [n*n] * @param n number of elements in the new row. - * @param ld stride of colums in L + * @param ld stride of columns in L * @param workspace device pointer to workspace shall be nullptr ar an array * of size [n_bytes]. * @param n_bytes size of workspace is returned here if workspace==nullptr. @@ -132,6 +133,7 @@ void choleskyRank1Update(const raft::handle_t& handle, { detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); } + }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 1158ad3aa4..a1967c36cb 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -18,126 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CHOLESKY_R1_UPDATE_H -#define __CHOLESKY_R1_UPDATE_H +/** + * DISCLAIMER: this file is deprecated: use cholesky_r1_update.cuh instead + */ #pragma once -#include "detail/cholesky_r1_update.cuh" - -namespace raft { -namespace linalg { - -/** - * @brief Rank 1 update of Cholesky decomposition. - * - * This method is useful if an algorithm iteratively builds up matrix A, and - * the Cholesky decomposition of A is required at each step. - * - * On entry, L is the Cholesky decomposition of matrix A, where both A and L - * have size n-1 x n-1. 
We are interested in the Cholesky decomposition of a new - * matrix A', which we get by adding a row and column to A. In Python notation: - * - A'[0:n-1, 0:n-1] = A; - * - A'[:,n-1] = A[n-1,:] = A_new - * - * On entry, the new column A_new, is stored as the n-th column of L if uplo == - * CUBLAS_FILL_MODE_UPPER, else A_new is stored as the n-th row of L. - * - * On exit L contains the Cholesky decomposition of A'. In practice the elements - * of A_new are overwritten with new row/column of the L matrix. - * - * The uplo paramater is used to select the matrix layout. - * If (uplo != CUBLAS_FILL_MODE_UPPER) then the input arg L stores the - * lower triangular matrix L, so that A = L * L.T. Otherwise the input arg L - * stores an upper triangular matrix U: A = U.T * U. - * - * On exit L will be updated to store the Cholesky decomposition of A'. - * - * If the matrix is not positive definit, or very ill conditioned then the new - * diagonal element of L would be NaN. In such a case an exception is thrown. - * The eps argument can be used to override this behavior: if eps >= 0 then - * the diagonal element is replaced by eps in case the diagonal is NaN or - * smaller than eps. Note: for an iterative solver it is probably better to - * stop early in case of error, rather than relying on the eps parameter. - * - * Examples: - * - * - Lower triangular factorization: - * @code{.cpp} - * // Initialize arrays - * int ld_L = n_rows; - * rmm::device_uvector L(ld_L * n_rows, stream); - * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, - * &n_bytes, CUBLAS_FILL_MODE_LOWER, - * stream); - * rmm::device_uvector workspace(n_bytes, stream); - * - * for (n=1; n<=n_rows; rank++) { - * // Calculate a new row/column of matrix A into A_new - * // ... - * // Copy new row to L[rank-1,:] - * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, - * L + n - 1, ld_L, stream)); - * // Update Cholesky factorization - * raft::linalg::choleskyRank1Update( - * handle, L, rank, ld_L, workspace, &n_bytes, CUBLAS_FILL_MODE_LOWER, - * stream); - * } - * Now L stores the Cholesky decomposition of A: A = L * L.T - * @endcode - * - * - Upper triangular factorization: - * @code{.cpp} - * // Initialize arrays - * int ld_U = n_rows; - * rmm::device_uvector U(ld_U * n_rows, stream); - * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, - * &n_bytes, CUBLAS_FILL_MODE_UPPER, - * stream); - * rmm::device_uvector workspace(stream, n_bytes, stream); - * - * for (n=1; n<=n_rows; n++) { - * // Calculate a new row/column of matrix A into array A_new - * // ... - * // Copy new row to U[:,n-1] (column major layout) - * raft::copy(U + ld_U * (n-1), A_new, n-1, stream); - * // - * // Update Cholesky factorization - * raft::linalg::choleskyRank1Update( - * handle, U, n, ld_U, workspace, &n_bytes, CUBLAS_FILL_MODE_UPPER, - * stream); - * } - * // Now U stores the Cholesky decomposition of A: A = U.T * U - * @endcode - * - * @param handle RAFT handle (used to retrive cuBLAS handles). - * @param L device array for to store the triangular matrix L, and the new - * column of A in column major format, size [n*n] - * @param n number of elements in the new row. - * @param ld stride of colums in L - * @param workspace device pointer to workspace shall be nullptr ar an array - * of size [n_bytes]. - * @param n_bytes size of workspace is returned here if workspace==nullptr. 
- * @param stream CUDA stream - * @param uplo indicates whether L is stored as an upper or lower triangular - * matrix (CUBLAS_FILL_MODE_UPPER or CUBLAS_FILL_MODE_LOWER) - * @param eps numerical parameter that can act as a regularizer for ill - * conditioned systems. Negative values mean no regularizaton. - */ -template -void choleskyRank1Update(const raft::handle_t& handle, - math_t* L, - int n, - int ld, - void* workspace, - int* n_bytes, - cublasFillMode_t uplo, - cudaStream_t stream, - math_t eps = -1) -{ - detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); -} -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "cholesky_r1_update.cuh" diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 03477f72d6..6ef0d52e62 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -20,6 +20,9 @@ #include "detail/coalesced_reduction.cuh" +#include +#include + namespace raft { namespace linalg { @@ -58,8 +61,8 @@ template > void coalescedReduction(OutType* dots, const InType* data, - int D, - int N, + IdxType D, + IdxType N, OutType init, cudaStream_t stream, bool inplace = false, @@ -67,9 +70,94 @@ void coalescedReduction(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + detail::coalescedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } +/** + * @defgroup coalesced_reduction Coalesced Memory Access Reductions + * For reducing along rows for col-major and along columns for row-major + * @{ + */ + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * This API is to be used when the desired reduction is along the dimension + * of the memory layout. For example, a row-major matrix will be reduced + * along the columns whereas a column-major matrix will be reduced along + * the rows. + * + * @tparam InValueType the input data-type of underlying raft::matrix_view + * @tparam LayoutPolicy The layout of Input/Output (row or col major) + * @tparam OutValueType the output data-type of underlying raft::matrix_view and reduction + * @tparam IndexType Integer type used to for addressing + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
+ * @param handle raft::handle_t + * @param[in] data Input of type raft::device_matrix_view + * @param[out] dots Output of type raft::device_matrix_view + * @param[in] init initial value to use for the reduction + * @param[in] inplace reduction result added inplace or overwrites old values? + * @param[in] main_op fused elementwise operation to apply before reduction + * @param[in] reduce_op fused binary reduction operation + * @param[in] final_op fused elementwise operation to apply before storing results + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalesced_reduction(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view dots, + OutValueType init, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + if constexpr (std::is_same_v) { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), + "Output should be equal to number of rows in Input"); + + coalescedReduction(dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + handle.get_stream(), + inplace, + main_op, + reduce_op, + final_op); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), + "Output should be equal to number of columns in Input"); + + coalescedReduction(dots.data_handle(), + data.data_handle(), + data.extent(0), + data.extent(1), + init, + handle.get_stream(), + inplace, + main_op, + reduce_op, + final_op); + } +} + +/** @} */ // end of group coalesced_reduction + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 48f8798a03..8631a7e5ba 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -18,64 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __COALESCED_REDUCTION_H -#define __COALESCED_REDUCTION_H - -#pragma once - -#include "detail/coalesced_reduction.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the leading dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use coalesced_reduction.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "coalesced_reduction.cuh" diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 5ccbd15c3d..8aed0cb4be 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -167,6 +167,28 @@ struct Policy4x4 { }; /** @} */ +/** + * A smaller k-block (8 instead of 32) with fewer threads per block (8x8 instead + * of 16x16), which is faster for raft::distance::fusedL2NN on skinny matrices, + * i.e., matrices with a small k dimension. + * + */ +template +struct Policy4x4Skinny { +}; + +template +struct Policy4x4Skinny { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy4x4Skinny { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + /** * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 * @{ diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp index 256593d9ae..7e5e9be403 100644 --- a/cpp/include/raft/linalg/contractions.hpp +++ b/cpp/include/raft/linalg/contractions.hpp @@ -18,199 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CONTRACTIONS_H -#define __CONTRACTIONS_H - -#pragma once - -#include "detail/contractions.cuh" - -namespace raft { -namespace linalg { - -/** - * @brief This is the central enum that should be used to configure the perf - * landscape of the Contraction kernel. - * - * Main goal of this Policy struct is to provide sufficient knobs to tune the - * perf of Contraction kernel, as and when we see matrices of different shapes. - * - * @tparam DataT the IO and math datatype - * @tparam _veclen number of k-elements loaded by each thread for every LDG call - * it makes. This should be configured based on the input 'k' - * value and the input data type. For eg: if DataT = float and - * k is multiples of 4, then setting this to 4 gives the best - * LDG pattern. Possible values are {1, 2, 4}. - * @tparam _kblk number of k-elements operated upon per main-loop iteration. - * Therefore total number of main-loop iterations will be - * `ceil(k/_kblk)`. This must be multiples of `_veclen`. Do note - * that bigger this value, the greater shared mem requirement. 
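To make the new mdspan-based `coalesced_reduction` entry point concrete, here is a sketch (not part of this diff) that sums each row of a row-major matrix; adding a `main_op` turns it into a squared-L2 row norm. Because the input parameter's element type is `const`, deduction needs an explicitly const view, built here with `raft::make_device_matrix_view` (an assumed `raft/core` helper; exact signature may differ by release):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/coalesced_reduction.cuh>

void row_reduce_sketch(const raft::handle_t& handle)
{
  int n_rows = 128, n_cols = 64;
  auto data = raft::make_device_matrix<float, int, raft::row_major>(handle, n_rows, n_cols);
  auto dots = raft::make_device_vector<float, int>(handle, n_rows);

  auto data_const = raft::make_device_matrix_view<const float, int, raft::row_major>(
    data.data_handle(), n_rows, n_cols);

  // Row-major input: the coalesced (leading) dimension is the columns, so this
  // produces one value per row. dots[i] = sum_j data(i, j), starting from 0.
  raft::linalg::coalesced_reduction(handle, data_const, dots.view(), 0.0f);

  // Squared L2 norm per row: square each element before summation. The main_op
  // matches the documented (InType, IdxType) shape; needs nvcc extended lambdas.
  raft::linalg::coalesced_reduction(
    handle, data_const, dots.view(), 0.0f, /*inplace=*/false,
    [] __device__(float v, int) { return v * v; });
}
```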
- * @tparam _rpt Defines the number of rows that a given thread accumulates on. - * This directly results in increased register pressure. This - * also is used to compute the number of m-elements worked upon - * by each thread block. - * @tparam _cpt Defines the number of cols that a given thread accumulates on. - * This directly results in increased register pressure. This - * also is used to compute the number of n-elements worked upon - * by each thread block. - * @tparam _tr Number of threads working on the same output column. This is - * used to compute the number of m-elements worked upon by each - * thread block. This also determines the number of threads per - * thread block - * @tparam _tc Number of threads working on the same output row. This is - * used to compute the number of m-elements worked upon by each - * thread block. This also determines the number of threads per - * thread block - */ -template -struct KernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single row */ - LdgThRow = Kblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Mblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Nblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Mblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Nblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Kblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Mblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Nblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // enum - -}; // struct KernelPolicy - -template -struct ColKernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single col */ - LdgThRow = Mblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Kblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Kblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Kblk / 
LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Kblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Mblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Kblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Kblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // colMajor enum - static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); -}; /** - * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32 - * @{ + * DISCLAIMER: this file is deprecated: use contractions.cuh instead */ -template -struct Policy4x4 { -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 - * @{ - */ -template -struct Policy2x8 { -}; - -template -struct Policy2x8 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy2x8 { - // this is not used just for keeping compiler happy. - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @brief Base class for gemm-like NT contractions - * - * This class does not provide any arithmetic operations, but only provides the - * memory-related operations of loading the `x` and `y` matrix blocks from the - * global memory into shared memory and then from shared into registers. Thus, - * this class acts as a basic building block for further composing gemm-like NT - * contractions on input matrices which are row-major (and so does the output) - * - * @tparam DataT IO and math data type - * @tparam IdxT indexing type - * @tparam Policy policy used to customize memory access behavior. - * See documentation for `KernelPolicy` to know more. - */ -using detail::Contractions_NT; +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "contractions.cuh" diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index 288ac228c9..34966ebbc2 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -18,9 +18,9 @@ #include "functional.cuh" -#include #include #include +#include #include @@ -40,27 +40,24 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); } -template -__global__ void add_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, +template +__global__ void add_dev_scalar_kernel(OutT* outDev, + const InT* inDev, + const InT* singleScalarDev, IdxType len) { IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } -template -void addDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) +template +void addDevScalar( + OutT* outDev, const InT* inDev, const InT* singleScalarDev, IdxType len, cudaStream_t stream) { // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/detail/axpy.cuh b/cpp/include/raft/linalg/detail/axpy.cuh index c0ce398de9..f3e1a177c8 100644 --- a/cpp/include/raft/linalg/detail/axpy.cuh +++ b/cpp/include/raft/linalg/detail/axpy.cuh @@ -20,7 +20,7 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft::linalg::detail { diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh index 6b1f8bc6d7..d073e164fd 100644 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh index df1fb0a1f3..a1d6ebbe6e 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh @@ -18,7 +18,7 @@ #include "cublas_wrappers.hpp" #include "cusolver_wrappers.hpp" -#include +#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh index 7e545e4932..cf1b8cf5a5 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index 0261d1967e..5d83f88e71 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index a55e1d6d7c..03975b1b7d 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ 
b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index e7da615748..3eff920dd8 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/divide.cuh b/cpp/include/raft/linalg/detail/divide.cuh index cb46ae76de..333cd3e83c 100644 --- a/cpp/include/raft/linalg/detail/divide.cuh +++ b/cpp/include/raft/linalg/detail/divide.cuh @@ -17,16 +17,18 @@ #pragma once #include "functional.cuh" + +#include #include namespace raft { namespace linalg { namespace detail { -template -void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +template +void divideScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp(out, in, len, divides_scalar(scalar), stream); + raft::linalg::unaryOp(out, in, len, divides_scalar(scalar), stream); } }; // end namespace detail diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 1d9a6bfa8f..d48b42fc57 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -18,9 +18,9 @@ #include "cusolver_wrappers.hpp" #include -#include -#include +#include #include +#include #include #include @@ -139,9 +139,9 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; template void eigSelDC(const raft::handle_t& handle, math_t* in, - int n_rows, - int n_cols, - int n_eig_vals, + std::size_t n_rows, + std::size_t n_cols, + std::size_t n_eig_vals, math_t* eig_vectors, math_t* eig_vals, EigVecMemUsage memUsage, @@ -156,13 +156,13 @@ void eigSelDC(const raft::handle_t& handle, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, - n_rows, + static_cast(n_rows), in, - n_cols, + static_cast(n_cols), math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, + static_cast(n_cols - n_eig_vals + 1), + static_cast(n_cols), &h_meig, eig_vals, &lwork)); @@ -176,13 +176,13 @@ void eigSelDC(const raft::handle_t& handle, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, - n_rows, + static_cast(n_rows), in, - n_cols, + static_cast(n_cols), math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, + static_cast(n_cols - n_eig_vals + 1), + static_cast(n_cols), &h_meig, eig_vals, d_work.data(), @@ -197,13 +197,13 @@ void eigSelDC(const raft::handle_t& handle, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, - n_rows, + static_cast(n_rows), eig_vectors, - n_cols, + static_cast(n_cols), math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, + static_cast(n_cols - n_eig_vals + 1), + static_cast(n_cols), &h_meig, eig_vals, d_work.data(), @@ -230,8 +230,8 @@ void eigSelDC(const raft::handle_t& handle, template void eigJacobi(const raft::handle_t& handle, const math_t* in, - int n_rows, - int n_cols, + std::size_t n_rows, + std::size_t n_cols, math_t* eig_vectors, math_t* eig_vals, cudaStream_t stream, @@ -249,9 +249,9 @@ void eigJacobi(const raft::handle_t& handle, RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - n_rows, + static_cast(n_rows), eig_vectors, - n_cols, + static_cast(n_cols), eig_vals, &lwork, 
syevj_params)); @@ -264,9 +264,9 @@ void eigJacobi(const raft::handle_t& handle, RAFT_CUSOLVER_TRY(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - n_rows, + static_cast(n_rows), eig_vectors, - n_cols, + static_cast(n_cols), eig_vals, d_work.data(), lwork, diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 50a8be6018..baa066984b 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -20,7 +20,7 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft { namespace linalg { @@ -148,7 +148,7 @@ void gemm(const raft::handle_t& handle, handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } -template +template void gemm(const raft::handle_t& handle, T* z, T* x, @@ -160,10 +160,11 @@ void gemm(const raft::handle_t& handle, bool isXColMajor, bool isYColMajor, cudaStream_t stream, - T alpha = T(1.0), - T beta = T(0.0)) + T* alpha, + T* beta) { cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublas_device_pointer_mode pmode(cublas_h); cublasOperation_t trans_a, trans_b; T *a, *b, *c; @@ -233,7 +234,7 @@ void gemm(const raft::handle_t& handle, } // Actual cuBLAS call RAFT_CUBLAS_TRY( - cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); + cublasgemm(cublas_h, trans_a, trans_b, M, N, K, alpha, a, lda, b, ldb, beta, c, ldc, stream)); } } // namespace detail diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index ad2e5275cb..38fcdcd82e 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -20,7 +20,7 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh index 9fa0d79875..5a3c595512 100644 --- a/cpp/include/raft/linalg/detail/lanczos.cuh +++ b/cpp/include/raft/linalg/detail/lanczos.cuh @@ -26,11 +26,11 @@ #include #include "cublas_wrappers.hpp" -#include -#include +#include #include #include #include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh index 4ce8275e08..1273956b21 100644 --- a/cpp/include/raft/linalg/detail/lstsq.cuh +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -30,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 56f1dd6f19..add003eb52 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -17,28 +17,38 @@ #pragma once #include -#include -#include -#include +#include +#include +#include namespace raft { namespace linalg { namespace detail { -template -__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +template +__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) { auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template +template void mapImpl( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... 
args) { - const int nblks = raft::ceildiv(len, (size_t)TPB); - mapKernel + const int nblks = raft::ceildiv(len, (IdxType)TPB); + mapKernel <<>>(out, len, map, in, args...); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 281861b2f9..7ef9ca1c43 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include -#include -#include +#include +#include +#include namespace raft { namespace linalg { @@ -48,12 +48,13 @@ __device__ void reduce(OutType* out, const InType acc, ReduceLambda op) template __global__ void mapThenReduceKernel(OutType* out, - size_t len, + IdxType len, OutType neutral, MapOp map, ReduceLambda op, @@ -72,12 +73,13 @@ __global__ void mapThenReduceKernel(OutType* out, template void mapThenReduceImpl(OutType* out, - size_t len, + IdxType len, OutType neutral, MapOp map, ReduceLambda op, @@ -86,8 +88,8 @@ void mapThenReduceImpl(OutType* out, Args... args) { raft::update_device(out, &neutral, 1, stream); - const int nblks = raft::ceildiv(len, (size_t)TPB); - mapThenReduceKernel + const int nblks = raft::ceildiv(len, IdxType(TPB)); + mapThenReduceKernel <<>>(out, len, neutral, map, op, in, args...); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/detail/multiply.cuh b/cpp/include/raft/linalg/detail/multiply.cuh index ec3ec802de..f1a8548bfa 100644 --- a/cpp/include/raft/linalg/detail/multiply.cuh +++ b/cpp/include/raft/linalg/detail/multiply.cuh @@ -23,7 +23,8 @@ namespace linalg { namespace detail { template -void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +void multiplyScalar( + math_t* out, const math_t* in, const math_t scalar, IdxType len, cudaStream_t stream) { raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh index 03d03497e9..a0b557211c 100644 --- a/cpp/include/raft/linalg/detail/norm.cuh +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -37,32 +37,32 @@ void rowNormCaller(Type* dots, { switch (type) { case L1Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; default: ASSERT(false, "Invalid norm type passed! 
[%d]", type); }; @@ -80,32 +80,32 @@ void colNormCaller(Type* dots, { switch (type) { case L1Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 4aa843081e..74e9c3e1aa 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -28,6 +28,60 @@ namespace raft { namespace linalg { namespace detail { +/** + * @brief Calculate the QR decomposition and get matrix Q in place of the input. + * + * Subject to the algorithm constraint `n_rows >= n_cols`. + * + * @param handle + * @param[inout] Q device pointer to input matrix and the output matrix Q, + * both column-major and of size [n_rows, n_cols]. + * @param n_rows + * @param n_cols + * @param stream + */ +template +void qrGetQ_inplace( + const raft::handle_t& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream) +{ + RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols."); + cusolverDnHandle_t cusolver = handle.get_cusolver_dn_handle(); + + rmm::device_uvector tau(n_cols, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream)); + + rmm::device_scalar dev_info(stream); + int ws_size; + + RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size)); + rmm::device_uvector workspace(ws_size, stream); + RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolver, + n_rows, + n_cols, + Q, + n_rows, + tau.data(), + workspace.data(), + ws_size, + dev_info.data(), + stream)); + + RAFT_CUSOLVER_TRY( + cusolverDnorgqr_bufferSize(cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size)); + workspace.resize(ws_size, stream); + RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolver, + n_rows, + n_cols, + n_cols, + Q, + n_rows, + tau.data(), + workspace.data(), + ws_size, + dev_info.data(), + stream)); +} + template void qrGetQ(const raft::handle_t& handle, const math_t* M, @@ -36,27 +90,8 @@ void qrGetQ(const raft::handle_t& handle, int n_cols, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int m = n_rows, n = n_cols; - int k = std::min(m, n); - RAFT_CUDA_TRY(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - - rmm::device_uvector tau(k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - - rmm::device_scalar devInfo(stream); - int Lwork; - - RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); - rmm::device_uvector workspace(Lwork, stream); - RAFT_CUSOLVER_TRY(cusolverDngeqrf( - cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); - - RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); - workspace.resize(Lwork, stream); - RAFT_CUSOLVER_TRY(cusolverDnorgqr( - cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), 
stream)); + raft::copy(Q, M, n_rows * n_cols, stream); + qrGetQ_inplace(handle, Q, n_rows, n_cols, stream); } template diff --git a/cpp/include/raft/linalg/detail/reduce.cuh b/cpp/include/raft/linalg/detail/reduce.cuh index 4d5fa87202..3022973b43 100644 --- a/cpp/include/raft/linalg/detail/reduce.cuh +++ b/cpp/include/raft/linalg/detail/reduce.cuh @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include namespace raft { namespace linalg { @@ -32,8 +32,8 @@ template > void reduce(OutType* dots, const InType* data, - int D, - int N, + IdxType D, + IdxType N, OutType init, bool rowMajor, bool alongRows, @@ -44,16 +44,16 @@ void reduce(OutType* dots, FinalLambda final_op = raft::Nop()) { if (rowMajor && alongRows) { - raft::linalg::coalescedReduction( + raft::linalg::coalescedReduction( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (rowMajor && !alongRows) { - raft::linalg::stridedReduction( + raft::linalg::stridedReduction( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (!rowMajor && alongRows) { - raft::linalg::stridedReduction( + raft::linalg::stridedReduction( dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - raft::linalg::coalescedReduction( + raft::linalg::coalescedReduction( dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index 54cf9aa204..b956fa900e 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index 7550ce2093..9ddcbae20b 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include @@ -95,16 +95,16 @@ struct quadSum { template __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) - __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT* d_A, int lda, const char* d_keys, const WeightT* d_weights, int nrows, int ncols, int nkeys, - DataIteratorT d_sums) + DataIteratorT* d_sums) { - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -158,14 +158,14 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) } template -void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, +void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A, int lda, const char* d_keys, const WeightT* d_weights, int nrows, int ncols, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t st) { dim3 grid, block; @@ -189,18 +189,18 @@ void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024 template -__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, +__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT* d_A, int lda, - const KeysIteratorT d_keys, + KeysIteratorT d_keys, const WeightT* d_weights, int nrows, int ncols, int key_offset, int nkeys, - DataIteratorT 
d_sums) + DataIteratorT* d_sums) { typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K]; for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) @@ -238,14 +238,14 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT } template -void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A, +void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A, int lda, KeysIteratorT d_keys, int nrows, int ncols, int key_offset, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t st) { dim3 grid, block; @@ -264,7 +264,7 @@ void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A, //#define RRBK_SHMEM template -__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, +__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT* d_A, int lda, const WeightT* d_weights, KeysIteratorT d_keys, @@ -272,10 +272,10 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT int ncols, int key_offset, int nkeys, - DataIteratorT d_sums) + DataIteratorT* d_sums) { typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; #ifdef RRBK_SHMEM __shared__ KeyType sh_keys[RRBK_SHMEM_SZ]; @@ -320,15 +320,15 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT } template -void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A, +void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A, int lda, - const KeysIteratorT d_keys, + KeysIteratorT d_keys, const WeightT* d_weights, int nrows, int ncols, int key_offset, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t st) { // x-dim refers to the column in the input data @@ -367,19 +367,19 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A, * @param[in] stream CUDA stream */ template -void reduce_rows_by_key(const DataIteratorT d_A, +void reduce_rows_by_key(const DataIteratorT* d_A, int lda, - const KeysIteratorT d_keys, + KeysIteratorT d_keys, const WeightT* d_weights, char* d_keys_char, int nrows, int ncols, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t stream) { typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; // Following kernel needs memset cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream); @@ -418,17 +418,17 @@ void reduce_rows_by_key(const DataIteratorT d_A, * @param[in] stream CUDA stream */ template -void reduce_rows_by_key(const DataIteratorT d_A, +void reduce_rows_by_key(const DataIteratorT* d_A, int lda, - const KeysIteratorT d_keys, + KeysIteratorT d_keys, char* d_keys_char, int nrows, int ncols, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t stream) { - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; reduce_rows_by_key(d_A, lda, d_keys, diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index 5487aead19..f96598d9e6 100644 --- a/cpp/include/raft/linalg/detail/rsvd.cuh +++ 
b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include #include diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index f7af9e88d6..d72bd54a32 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -18,8 +18,8 @@ #include "unary_op.cuh" #include -#include #include +#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 084c6d2fd3..ae0f09d2fe 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index aa33dcb0a9..90a7ddec1f 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -23,11 +23,11 @@ #include #include -#include -#include -#include +#include #include #include +#include +#include #include #include @@ -66,15 +66,9 @@ void svdQR(const raft::handle_t& handle, char jobu = 'S'; char jobvt = 'A'; - if (!gen_left_vec) { - char new_u = 'N'; - strcpy(&jobu, &new_u); - } + if (!gen_left_vec) { jobu = 'N'; } - if (!gen_right_vec) { - char new_vt = 'N'; - strcpy(&jobvt, &new_vt); - } + if (!gen_right_vec) { jobvt = 'N'; } RAFT_CUSOLVER_TRY(cusolverDngesvd(cusolverH, jobu, diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh index bcfcc9df01..7874f20f56 100644 --- a/cpp/include/raft/linalg/detail/ternary_op.cuh +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -16,15 +16,15 @@ #pragma once -#include -#include +#include +#include namespace raft { namespace linalg { namespace detail { -template +template __global__ void ternaryOpKernel( - math_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) + out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) { typedef raft::TxN_t VecType; VecType a, b, c; @@ -41,8 +41,8 @@ __global__ void ternaryOpKernel( a.store(out, idx); } -template -void ternaryOpImpl(math_t* out, +template +void ternaryOpImpl(out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, @@ -51,7 +51,7 @@ void ternaryOpImpl(math_t* out, cudaStream_t stream) { const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); - ternaryOpKernel + ternaryOpKernel <<>>(out, in1, in2, in3, len, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -70,8 +70,8 @@ void ternaryOpImpl(math_t* out, * @param op the device-lambda * @param stream cuda stream where to launch work */ -template -void ternaryOp(math_t* out, +template +void ternaryOp(out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, @@ -81,22 +81,22 @@ void ternaryOp(math_t* out, { size_t bytes = len * sizeof(math_t); if (16 / sizeof(math_t) && bytes % 16 == 0) { - ternaryOpImpl( + ternaryOpImpl( out, in1, in2, in3, len, op, stream); } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - ternaryOpImpl( + ternaryOpImpl( out, in1, in2, in3, len, op, stream); } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - ternaryOpImpl( + ternaryOpImpl( out, in1, in2, in3, len, op, stream); } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - ternaryOpImpl( + ternaryOpImpl( out, in1, in2, in3, len, op, stream); } else if (1 / sizeof(math_t)) { - ternaryOpImpl( + ternaryOpImpl( out, in1, in2, in3, len, op, stream); } else { - ternaryOpImpl(out, in1, in2, in3, len, op, stream); + ternaryOpImpl(out, in1, in2, in3, len, op, stream); } } diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index 242d3a3912..ef5551ea7e 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -18,8 +18,8 @@ #include "cublas_wrappers.hpp" -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh index 9ddfe79657..cdadc6f868 100644 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -16,9 +16,9 @@ #pragma once -#include -#include -#include +#include +#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index 820c42f0ea..53b083045e 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -20,6 +20,9 @@ #include "detail/divide.cuh" +#include +#include + namespace raft { namespace linalg { @@ -27,7 +30,8 @@ using detail::divides_scalar; /** * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed + * @tparam OutT output data-type upon which the math operation will be performed + * @tparam InT input data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param out the output buffer * @param in the input buffer @@ -36,13 +40,62 @@ using detail::divides_scalar; * @param stream cuda stream where to launch work * @{ */ -template -void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +template +void divideScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { detail::divideScalar(out, in, scalar, len, stream); } /** @} */ +/** + * @defgroup divide Division Arithmetic + * @{ + */ + +/** + * @brief Elementwise division of input by host scalar + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[in] scalar raft::host_scalar_view + * @param[out] out Output + */ +template , + typename = 
raft::enable_if_output_device_mdspan<OutType>>
+void divide_scalar(const raft::handle_t& handle,
+                   InType in,
+                   OutType out,
+                   raft::host_scalar_view<typename InType::value_type, ScalarIdxType> scalar)
+{
+  using in_value_t  = typename InType::value_type;
+  using out_value_t = typename OutType::value_type;
+
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
+  RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input");
+
+  if (out.size() <= std::numeric_limits<std::uint32_t>::max()) {
+    divideScalar<in_value_t, out_value_t, std::uint32_t>(out.data_handle(),
+                                                         in.data_handle(),
+                                                         *scalar.data_handle(),
+                                                         static_cast<std::uint32_t>(out.size()),
+                                                         handle.get_stream());
+  } else {
+    divideScalar<in_value_t, out_value_t, std::uint64_t>(out.data_handle(),
+                                                         in.data_handle(),
+                                                         *scalar.data_handle(),
+                                                         static_cast<std::uint64_t>(out.size()),
+                                                         handle.get_stream());
+  }
+}
+
+/** @} */  // end of group divide
+
 }; // end namespace linalg
 }; // end namespace raft
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index 8d1bd37186..57f4376fcc 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -18,37 +18,14 @@
  * Please use the cuh version instead.
  */
-#ifndef __DIVIDE_H
-#define __DIVIDE_H
-
-#pragma once
-
-#include "detail/divide.cuh"
-
-namespace raft {
-namespace linalg {
-
-using detail::divides_scalar;
-
 /**
- * @defgroup ScalarOps Scalar operations on the input buffer
- * @tparam math_t data-type upon which the math operation will be performed
- * @tparam IdxType Integer type used to for addressing
- * @param out the output buffer
- * @param in the input buffer
- * @param scalar the scalar used in the operations
- * @param len number of elements in the input buffer
- * @param stream cuda stream where to launch work
- * @{
+ * DISCLAIMER: this file is deprecated: use divide.cuh instead
  */
-template <typename math_t, typename IdxType = int>
-void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
-  detail::divideScalar(out, in, scalar, len, stream);
-}
-/** @} */
-}; // end namespace linalg
-}; // end namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release."
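As an aside for reviewers, here is a minimal sketch of how the new mdspan-based `divide_scalar` wrapper above is meant to be called. The `make_device_matrix`/`make_host_scalar` helpers and include paths are assumptions based on RAFT's mdarray utilities, not part of this patch:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/linalg/divide.cuh>

void divide_scalar_example(const raft::handle_t& handle)
{
  // Illustrative 4x3 matrices; any contiguous (row- or column-major) layout works.
  auto in     = raft::make_device_matrix<float, int>(handle, 4, 3);
  auto out    = raft::make_device_matrix<float, int>(handle, 4, 3);
  auto scalar = raft::make_host_scalar<float>(2.0f);

  // Computes out[i] = in[i] / 2.0f; note the scalar stays on the host.
  raft::linalg::divide_scalar(handle, in.view(), out.view(), scalar.view());
}
```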
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "divide.cuh" diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index f1f02dc13e..2ad222d42d 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -20,6 +20,8 @@ #include "detail/eig.cuh" +#include + namespace raft { namespace linalg { @@ -73,9 +75,9 @@ using detail::OVERWRITE_INPUT; template void eigSelDC(const raft::handle_t& handle, math_t* in, - int n_rows, - int n_cols, - int n_eig_vals, + std::size_t n_rows, + std::size_t n_cols, + std::size_t n_eig_vals, math_t* eig_vectors, math_t* eig_vals, EigVecMemUsage memUsage, @@ -102,8 +104,8 @@ void eigSelDC(const raft::handle_t& handle, template void eigJacobi(const raft::handle_t& handle, const math_t* in, - int n_rows, - int n_cols, + std::size_t n_rows, + std::size_t n_cols, math_t* eig_vectors, math_t* eig_vals, cudaStream_t stream, @@ -112,6 +114,109 @@ void eigJacobi(const raft::handle_t& handle, { detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); } + +/** + * @brief eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @tparam ValueType the data-type of input and output + * @tparam IntegerType Integer used for addressing + * @param handle raft::handle_t + * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and + * vectors) + * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view + * @param[out] eig_vals: eigen values output of type raft::device_vector_view + */ +template +void eig_dc(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view eig_vectors, + raft::device_vector_view eig_vals) +{ + RAFT_EXPECTS(in.size() == eig_vectors.size(), "Size mismatch between Input and Eigen Vectors"); + RAFT_EXPECTS(eig_vals.size() == in.extent(1), "Size mismatch between Input and Eigen Values"); + + eigDC(handle, + in.data_handle(), + in.extent(0), + in.extent(1), + eig_vectors.data_handle(), + eig_vals.data_handle(), + handle.get_stream()); +} + +/** + * @brief eig decomp to select top-n eigen values with divide and conquer method + * for the column-major symmetric matrices + * @tparam ValueType the data-type of input and output + * @tparam IntegerType Integer used for addressing + * @param[in] handle raft::handle_t + * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and + * vectors) + * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view + * @param[out] eig_vals: eigen values output of type raft::device_vector_view + * @param[in] n_eig_vals: number of eigenvectors to be generated + * @param[in] memUsage: the memory selection for eig vector output + */ +template +void eig_dc_selective(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view eig_vectors, + raft::device_vector_view eig_vals, + std::size_t n_eig_vals, + EigVecMemUsage memUsage) +{ + RAFT_EXPECTS(eig_vectors.size() == n_eig_vals * in.extent(0), + "Size mismatch between Input and Eigen Vectors"); + RAFT_EXPECTS(eig_vals.size() == n_eig_vals, "Size mismatch between Input and Eigen Values"); + + raft::linalg::eigSelDC(handle, + const_cast(in.data_handle()), + in.extent(0), + in.extent(1), + n_eig_vals, + eig_vectors.data_handle(), + eig_vals.data_handle(), + memUsage, + handle.get_stream()); +} + +/** + * @brief overloaded function for eig decomp with Jacobi method for the + 
* column-major symmetric matrices (in parameter) + * @tparam ValueType the data-type of input and output + * @tparam IntegerType Integer used for addressing + * @param handle raft::handle_t + * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and + * vectors) + * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view + * @param[out] eig_vals: eigen values output of type raft::device_vector_view + * @param[in] tol: error tolerance for the jacobi method. Algorithm stops when the + Frobenius norm of the absolute error is below tol + * @param[in] sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. + */ +template +void eig_jacobi(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view eig_vectors, + raft::device_vector_view eig_vals, + ValueType tol = 1.e-7, + int sweeps = 15) +{ + RAFT_EXPECTS(in.size() == eig_vectors.size(), "Size mismatch between Input and Eigen Vectors"); + RAFT_EXPECTS(eig_vals.size() == in.extent(1), "Size mismatch between Input and Eigen Values"); + + eigJacobi(handle, + in.data_handle(), + in.extent(0), + in.extent(1), + eig_vectors.data_handle(), + eig_vals.data_handle(), + handle.get_stream(), + tol, + sweeps); +} + /** @} */ // end of eig }; // end namespace linalg diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 032c4e97f9..175a2aaccc 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -18,108 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __EIG_H -#define __EIG_H - -#pragma once - -#include "detail/eig.cuh" - -namespace raft { -namespace linalg { - -/** - * @defgroup eig Eigen Decomposition Methods - * @{ - */ - /** - * @brief eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use eig.cuh instead */ -template -void eigDC(const raft::handle_t& handle, - const math_t* in, - std::size_t n_rows, - std::size_t n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream) -{ - detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); -} -using detail::COPY_INPUT; -using detail::EigVecMemUsage; -using detail::OVERWRITE_INPUT; - -/** - * @brief eig sel decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. 
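For context, a sketch of driving the new `eig_dc` wrapper added above. The dimension and the `make_device_*` helpers are illustrative assumptions; the input must be symmetric and column-major per the docs, and if the final signature takes a const view, `in.view()` may need to be wrapped accordingly:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/eig.cuh>

void eig_dc_example(const raft::handle_t& handle)
{
  std::size_t n = 8;  // illustrative dimension
  auto in          = raft::make_device_matrix<float, std::size_t, raft::col_major>(handle, n, n);
  auto eig_vectors = raft::make_device_matrix<float, std::size_t, raft::col_major>(handle, n, n);
  auto eig_vals    = raft::make_device_vector<float, std::size_t>(handle, n);

  // ... fill `in` with a symmetric matrix ...

  // Divide-and-conquer eigendecomposition; vectors and values are written out.
  raft::linalg::eig_dc(handle, in.view(), eig_vectors.view(), eig_vals.view());
}
```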
- * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param n_eig_vals: number of eigenvectors to be generated - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param memUsage: the memory selection for eig vector output - * @param stream cuda stream - */ -template -void eigSelDC(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - int n_eig_vals, - math_t* eig_vectors, - math_t* eig_vals, - EigVecMemUsage memUsage, - cudaStream_t stream) -{ - detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); -} - -/** - * @brief overloaded function for eig decomp with Jacobi method for the - * column-major symmetric matrices (in parameter) - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream: stream on which this function will be run - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. - */ -template -void eigJacobi(const raft::handle_t& handle, - const math_t* in, - int n_rows, - int n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream, - math_t tol = 1.e-7, - int sweeps = 15) -{ - detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); -} -/** @} */ // end of eig +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "eig.cuh" diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 62624f6eeb..8931c88241 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -18,94 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __ELTWISE_H -#define __ELTWISE_H - -#pragma once - -#include "detail/eltwise.cuh" - -namespace raft { -namespace linalg { - -using detail::adds_scalar; - -/** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam InType data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ - */ -template -void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ - detail::scalarAdd(out, in, scalar, len, stream); -} - -using detail::multiplies_scalar; - -template -void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ - detail::scalarMultiply(out, in, scalar, len, stream); -} -/** @} */ - /** - * @defgroup BinaryOps Element-wise binary operations on the input buffers - * @tparam InType data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use eltwise.cuh instead */ -template -void eltwiseAdd( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseAdd(out, in1, in2, len, stream); -} - -template -void eltwiseSub( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseSub(out, in1, in2, len, stream); -} -template -void eltwiseMultiply( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseMultiply(out, in1, in2, len, stream); -} - -template -void eltwiseDivide( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseDivide(out, in1, in2, len, stream); -} - -using detail::divides_check_zero; - -template -void eltwiseDivideCheckZero( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ - detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); -} -/** @} */ +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "eltwise.cuh" diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 16a5bc48ea..f2354da6c6 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -20,6 +20,12 @@ #include "detail/gemm.hpp" +#include +#include +#include +#include +#include + namespace raft { namespace linalg { @@ -145,7 +151,7 @@ void gemm(const raft::handle_t& handle, * @param x input matrix of size M rows x K columns * @param y input matrix of size K rows x N columns * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z + * @param _N number of columns of Y and columns of Z * @param _K number of columns of X and rows of Y * @param isZColMajor Storage layout of Z. true = col major, false = row major * @param isXColMajor Storage layout of X. 
true = col major, false = row major @@ -170,9 +176,102 @@ void gemm(const raft::handle_t& handle, T beta = T(0.0)) { detail::gemm( - handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); + handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, &alpha, &beta); +} + +/** + * @defgroup gemm Matrix-Matrix Multiplication + * @{ + */ + +/** + * @brief GEMM function designed for handling all possible + * combinations of operand layouts (raft::row_major or raft::col_major) + * with scalars alpha and beta on the host or device + * It computes the following equation: Z = alpha . X * Y + beta . Z + * If alpha is not provided, it is assumed to be 1.0 + * If beta is not provided, it is assumed to be 0.0 + * @tparam ValueType Data type of input/output matrices (float/double) + * @tparam IndexType Type of index + * @tparam LayoutPolicyX layout of X + * @tparam LayoutPolicyY layout of Y + * @tparam LayoutPolicyZ layout of Z + * @param[in] handle raft handle + * @param[in] x input raft::device_matrix_view of size M rows x K columns + * @param[in] y input raft::device_matrix_view of size K rows x N columns + * @param[out] z output raft::device_matrix_view of size M rows x N columns + * @param[in] alpha optional raft::host_scalar_view or raft::device_scalar_view, default 1.0 + * @param[in] beta optional raft::host_scalar_view or raft::device_scalar_view, default 0.0 + */ +template , + typename = std::enable_if_t>, + std::is_same>>>> +void gemm(const raft::handle_t& handle, + raft::device_matrix_view x, + raft::device_matrix_view y, + raft::device_matrix_view z, + std::optional alpha = std::nullopt, + std::optional beta = std::nullopt) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(x), "X is not contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(y), "Y is not contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(z), "Z is not contiguous"); + + RAFT_EXPECTS(x.extent(0) == z.extent(0), "Number of rows of X and Z should be equal"); + RAFT_EXPECTS(y.extent(1) == z.extent(1), "Number of columns of Y and Z should be equal"); + RAFT_EXPECTS(x.extent(1) == y.extent(0), "Number of columns of X and rows of Y should be equal"); + + constexpr auto is_x_col_major = + std::is_same_v; + constexpr auto is_y_col_major = + std::is_same_v; + constexpr auto is_z_col_major = + std::is_same_v; + + constexpr auto device_mode = + std::is_same_v>; + + ValueType alpha_value = 1; + ValueType beta_value = 0; + + auto alpha_device = raft::make_device_scalar(handle, alpha_value); + auto beta_device = raft::make_device_scalar(handle, beta_value); + + auto alpha_host = raft::make_host_scalar(alpha_value); + auto beta_host = raft::make_host_scalar(beta_value); + + if constexpr (device_mode) { + if (!alpha) { alpha = alpha_device.view(); } + if (!beta) { beta = beta_device.view(); } + } else { + if (!alpha) { alpha = alpha_host.view(); } + if (!beta) { beta = beta_host.view(); } + } + + detail::gemm(handle, + z.data_handle(), + x.data_handle(), + y.data_handle(), + x.extent(0), + y.extent(1), + x.extent(1), + is_z_col_major, + is_x_col_major, + is_y_col_major, + handle.get_stream(), + alpha.value().data_handle(), + beta.value().data_handle()); } +/** @} */ // end of gemm + } // end namespace linalg } // end namespace raft diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 37c6b2d552..6ad2f1fbe1 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -18,167 +18,14 @@ * Please use the cuh version instead. 
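A usage sketch for the new mdspan `gemm` overload above, with alpha and beta left at their defaults (the shapes and allocation helpers are illustrative, not taken from this patch):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/gemm.cuh>

void gemm_example(const raft::handle_t& handle)
{
  int M = 16, K = 8, N = 4;  // illustrative shapes: z(M,N) = x(M,K) * y(K,N)
  auto x = raft::make_device_matrix<float, int, raft::row_major>(handle, M, K);
  auto y = raft::make_device_matrix<float, int, raft::row_major>(handle, K, N);
  auto z = raft::make_device_matrix<float, int, raft::row_major>(handle, M, N);

  // alpha/beta omitted, so they default to 1.0 and 0.0 on the host: z = x * y.
  raft::linalg::gemm(handle, x.view(), y.view(), z.view());
}
```

Since the wrapper dispatches on each operand's layout policy, mixing row- and column-major operands in one call should also work.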
*/ -#ifndef __GEMM_H -#define __GEMM_H - -#pragma once - -#include "detail/gemm.hpp" - -namespace raft { -namespace linalg { - /** - * @brief the wrapper of cublas gemm function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam math_t the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - * @param [in] stream + * DISCLAIMER: this file is deprecated: use gemm.cuh instead */ -template -void gemm(const raft::handle_t& handle, - const bool trans_a, - const bool trans_b, - const int m, - const int n, - const int k, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* B, - const int ldb, - const math_t* beta, - math_t* C, - const int ldc, - cudaStream_t stream) -{ - detail::gemm( - handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream); -} -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param alpha scalar - * @param beta scalar - * @param stream cuda stream - */ -template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - math_t alpha, - math_t beta, - cudaStream_t stream) -{ - detail::gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); -} - -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . 
C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param stream cuda stream - */ -template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) -{ - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); -} - -/** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible - * combinations of operand layouts. - * It computes the following equation: Z = alpha . X * Y + beta . Z - * @tparam T Data type of input/output matrices (float/double) - * @param handle raft handle - * @param z output matrix of size M rows x N columns - * @param x input matrix of size M rows x K columns - * @param y input matrix of size K rows x N columns - * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z - * @param _K number of columns of X and rows of Y - * @param isZColMajor Storage layout of Z. true = col major, false = row major - * @param isXColMajor Storage layout of X. true = col major, false = row major - * @param isYColMajor Storage layout of Y. true = col major, false = row major - * @param stream cuda stream - * @param alpha scalar - * @param beta scalar - */ -template -void gemm(const raft::handle_t& handle, - T* z, - T* x, - T* y, - int _M, - int _N, - int _K, - bool isZColMajor, - bool isXColMajor, - bool isYColMajor, - cudaStream_t stream, - T alpha = T(1.0), - T beta = T(0.0)) -{ - detail::gemm( - handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); -} +#pragma once -} // end namespace linalg -} // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif +#include "gemm.cuh" diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh index 26a6386148..8132a742f8 100644 --- a/cpp/include/raft/linalg/gemv.cuh +++ b/cpp/include/raft/linalg/gemv.cuh @@ -20,6 +20,12 @@ #include "detail/gemv.hpp" +#include +#include +#include +#include +#include + namespace raft { namespace linalg { @@ -206,6 +212,98 @@ void gemv(const raft::handle_t& handle, detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); } +/** + * @defgroup gemv Matrix-Vector Multiplication + * @{ + */ + +/** + * @brief GEMV function designed for raft::col_major layout for A + * It computes y = alpha * op(A) * x + beta * y, where length of y is number + * of rows in A while length of x is number of columns in A + * If layout for A is provided as raft::row_major, then a transpose of A + * is used in the computation, where length of y is number of columns in A + * while length of x is number of rows in A + * If alpha is not provided, it is assumed to be 1.0 + * If beta is not provided, it is assumed to be 0.0 + * @tparam ValueType Data type of input/output matrices (float/double) + * @tparam IndexType Type of index + * @tparam LayoutPolicyX layout of X + * @tparam LayoutPolicyY layout of Y + * @tparam LayoutPolicyZ layout of Z + * @param[in] handle raft handle + * @param[in] A input raft::device_matrix_view of size (M, N) + * @param[in] x input raft::device_matrix_view of size (N, 1) if A is raft::col_major, else (M, 1) + * @param[out] y output raft::device_matrix_view of size (M, 1) if A is raft::col_major, else (N, 1) + * @param[in] alpha optional raft::host_scalar_view or raft::device_scalar_view, default 1.0 + * @param[in] beta optional raft::host_scalar_view or raft::device_scalar_view, default 0.0 + */ +template , + typename = std::enable_if_t>, + std::is_same>>>> +void gemv(const raft::handle_t& handle, + raft::device_matrix_view A, + raft::device_vector_view x, + raft::device_vector_view y, + std::optional alpha = std::nullopt, + std::optional beta = std::nullopt) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(A), "A is not contiguous"); + + constexpr auto is_A_col_major = + std::is_same_v; + + if (is_A_col_major) { + RAFT_EXPECTS(x.extent(0) == A.extent(1), + "Number of columns of A and length of x should be equal"); + RAFT_EXPECTS(y.extent(0) == A.extent(0), "Number of rows of A and length of y should be equal"); + } else { + RAFT_EXPECTS(x.extent(0) == A.extent(0), "Number of rows of A and length of x should be equal"); + RAFT_EXPECTS(y.extent(0) == A.extent(1), + "Number of columns of A and length of y should be equal"); + } + + constexpr auto device_mode = + std::is_same_v>; + + ValueType alpha_value = 1; + ValueType beta_value = 0; + + auto alpha_device = raft::make_device_scalar(handle, alpha_value); + auto beta_device = raft::make_device_scalar(handle, beta_value); + + auto alpha_host = raft::make_host_scalar(alpha_value); + auto beta_host = raft::make_host_scalar(beta_value); + + if constexpr (device_mode) { + if (!alpha) { alpha = alpha_device.view(); } + if (!beta) { beta = beta_device.view(); } + } else { + if (!alpha) { alpha = alpha_host.view(); } + if (!beta) { beta = beta_host.view(); } + } + + gemv(handle, + !is_A_col_major, + A.extent(0), + A.extent(1), + alpha.value().data_handle(), + A.data_handle(), + A.extent(0), + x.data_handle(), + 1, + beta.value().data_handle(), + y.data_handle(), + 1, + handle.get_stream()); +} +/** @} */ // end of gemv + }; // namespace linalg }; 
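Similarly, a sketch of the new mdspan `gemv` wrapper above (sizes and helpers illustrative). For a column-major `A` the vector lengths follow the matrix extents directly; a row-major `A` is applied transposed, as documented:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/gemv.cuh>

void gemv_example(const raft::handle_t& handle)
{
  int M = 16, N = 8;  // illustrative shape of A
  auto A = raft::make_device_matrix<float, int, raft::col_major>(handle, M, N);
  auto x = raft::make_device_vector<float, int>(handle, N);  // len = #cols of A
  auto y = raft::make_device_vector<float, int>(handle, M);  // len = #rows of A

  // y = A * x with the defaulted alpha = 1, beta = 0.
  raft::linalg::gemv(handle, A.view(), x.view(), y.view());
}
```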
// namespace raft #endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 3b6b60263b..8161631fd3 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -18,200 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __GEMV_H -#define __GEMV_H - -#pragma once - -#include "detail/gemv.hpp" - -namespace raft { -namespace linalg { - -/** - * @brief the wrapper of cublas gemv function - * It computes the following equation: y = alpha .* op(A) * x + beta .* y - * - * @tparam math_t the element type - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle - * @param [in] trans_a cublas transpose op for A - * @param [in] m number of rows of A - * @param [in] n number of columns of A - * @param [in] alpha host or device scalar - * @param [in] A column-major matrix of size [m, n] - * @param [in] lda leading dimension of A - * @param [in] x vector of length n if trans_a else m - * @param [in] incx stride between consecutive elements of x - * @param [in] beta host or device scalar - * @param [inout] y vector of length m if trans_a else n - * @param [in] incy stride between consecutive elements of y - * @param [in] stream - */ -template -void gemv(const raft::handle_t& handle, - const bool trans_a, - const int m, - const int n, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* x, - const int incx, - const math_t* beta, - math_t* y, - const int incy, - cudaStream_t stream) -{ - detail::gemv( - handle, trans_a, m, n, alpha, A, lda, x, incx, beta, y, incy, stream); -} - -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows, - const int n_cols, - const math_t* x, - const int incx, - math_t* y, - const int incy, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream); -} - -/** - * y = alpha * op(A) * x + beta * y - * - * where - * - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param alpha is a scalar scale of Ax. - * @param beta is a scalar scale of y. - * @param stream stream on which this function is run - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const math_t* x, - math_t* y, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); -} - /** - * y = op(A) * x - * - * where - * - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
- * @param trans_a whether to take transpose of a - * @param stream stream on which this function is run + * DISCLAIMER: this file is deprecated: use gemv.cuh instead */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const math_t* x, - math_t* y, - const bool trans_a, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); -} -/** - * y = alpha * op(A) * x + beta * y - * - * where - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param alpha is a scalar scale of Ax. - * @param beta is a scalar scale of y. - * @param stream stream on which this function is run - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const int lda, - const math_t* x, - math_t* y, - const bool trans_a, - const math_t alpha, - const math_t beta, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); -} - -/** - * y = op(A) * x - * - * where - * @param handle raft handle - * @param A is a column-major matrix of size n_rows_a * n_cols_a. - * op(A) is either the transpose operation (trans_a == true) or identity. - * @param n_rows_a number of rows in A - * @param n_cols_a number of cols in A - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. - * @param trans_a whether to take transpose of a - * @param stream stream on which this function is run - * - */ -template -void gemv(const raft::handle_t& handle, - const math_t* A, - const int n_rows_a, - const int n_cols_a, - const int lda, - const math_t* x, - math_t* y, - const bool trans_a, - cudaStream_t stream) -{ - detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); -} +#pragma once -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "gemv.cuh" diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index db7b0f9cfe..9c59c886c9 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -18,48 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __INIT_H -#define __INIT_H - -#pragma once - -#include "detail/init.hpp" - -namespace raft { -namespace linalg { - /** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [end-start] - * \param [in] start of the range - * \param [in] end of range (exclusive) - * \param [in] stream cuda stream + * DISCLAIMER: this file is deprecated: use init.cuh instead */ -template -void range(T* out, int start, int end, cudaStream_t stream) -{ - detail::range(out, start, end, stream); -} -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. - * - * \param [out] out device array, size [n] - * \param [in] n length of the array - * \param [in] stream cuda stream - */ -template -void range(T* out, int n, cudaStream_t stream) -{ - detail::range(out, n, stream); -} +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "init.cuh" diff --git a/cpp/include/raft/linalg/lanczos.cuh b/cpp/include/raft/linalg/lanczos.cuh index a7157adfab..c9f3e0010e 100644 --- a/cpp/include/raft/linalg/lanczos.cuh +++ b/cpp/include/raft/linalg/lanczos.cuh @@ -13,150 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __LANCZOS_H -#define __LANCZOS_H - -#pragma once - -#include "detail/lanczos.cuh" -#include - -namespace raft { -namespace linalg { - -// ========================================================= -// Eigensolver -// ========================================================= - /** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Smallest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -template -int computeSmallestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 1234567) -{ - return detail::computeSmallestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} /** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. + * DISCLAIMER: this file is deprecated: use lanczos.cuh instead */ -template -int computeLargestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 123456) -{ - return detail::computeLargestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} -} // namespace linalg -} // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the sparse solvers version instead.") + +#include -#endif \ No newline at end of file +namespace raft::linalg { +using raft::sparse::solver::computeLargestEigenvectors; +using raft::sparse::solver::computeSmallestEigenvectors; +} // namespace raft::linalg diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 75e3d11444..2141e4e908 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -18,150 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __LANCZOS_H -#define __LANCZOS_H - -#pragma once - -#include "detail/lanczos.cuh" -#include - -namespace raft { -namespace linalg { - -// ========================================================= -// Eigensolver -// ========================================================= - /** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Smallest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. + * DISCLAIMER: this file is deprecated: use lanczos.cuh instead */ -template -int computeSmallestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 1234567) -{ - return detail::computeSmallestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} -/** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. 
- * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param iter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. - */ -template -int computeLargestEigenvectors( - handle_t const& handle, - spectral::matrix::sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 123456) -{ - return detail::computeLargestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} +#pragma once -} // namespace linalg -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/solvers version instead.") -#endif \ No newline at end of file +#include diff --git a/cpp/include/raft/linalg/linalg_types.hpp b/cpp/include/raft/linalg/linalg_types.hpp new file mode 100644 index 0000000000..e50d3a8e79 --- /dev/null +++ b/cpp/include/raft/linalg/linalg_types.hpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace raft::linalg { + +/** + * @brief Enum for reduction/broadcast where an operation is to be performed along + * a matrix's rows or columns + * + */ +enum class Apply { ALONG_ROWS, ALONG_COLUMNS }; + +/** + * @brief Enum for reduction/broadcast where an operation is to be performed along + * a matrix's rows or columns + * + */ +enum class FillMode { UPPER, LOWER }; + +} // end namespace raft::linalg \ No newline at end of file diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh index 255f1293f4..7654812886 100644 --- a/cpp/include/raft/linalg/lstsq.cuh +++ b/cpp/include/raft/linalg/lstsq.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace raft { namespace linalg { @@ -115,6 +115,135 @@ void lstsqQR(const raft::handle_t& handle, detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); } +/** + * @defgroup lstsq Least Squares Methods + * @{ + */ + +/** + * @brief Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S Vt`. + * + * @tparam ValueType the data-type of input/output + * @param[in] handle raft::handle_t + * @param[inout] A input raft::device_matrix_view + * Warning: the content of this matrix is modified. + * @param[inout] b input target raft::device_vector_view + * Warning: the content of this vector is modified. + * @param[out] w output coefficient raft::device_vector_view + */ +template +void lstsq_svd_qr(const raft::handle_t& handle, + raft::device_matrix_view A, + raft::device_vector_view b, + raft::device_vector_view w) +{ + RAFT_EXPECTS(A.extent(1) == w.size(), "Size mismatch between A and w"); + RAFT_EXPECTS(A.extent(0) == b.size(), "Size mismatch between A and b"); + + lstsqSvdQR(handle, + const_cast(A.data_handle()), + A.extent(0), + A.extent(1), + const_cast(b.data_handle()), + w.data_handle(), + handle.get_stream()); +} + +/** + * @brief Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S V^T` using Jacobi iterations. + * + * @tparam ValueType the data-type of input/output + * @param[in] handle raft::handle_t + * @param[inout] A input raft::device_matrix_view + * Warning: the content of this matrix is modified. + * @param[inout] b input target raft::device_vector_view + * Warning: the content of this vector is modified. + * @param[out] w output coefficient raft::device_vector_view + */ +template +void lstsq_svd_jacobi(const raft::handle_t& handle, + raft::device_matrix_view A, + raft::device_vector_view b, + raft::device_vector_view w) +{ + RAFT_EXPECTS(A.extent(1) == w.size(), "Size mismatch between A and w"); + RAFT_EXPECTS(A.extent(0) == b.size(), "Size mismatch between A and b"); + + lstsqSvdJacobi(handle, + const_cast(A.data_handle()), + A.extent(0), + A.extent(1), + const_cast(b.data_handle()), + w.data_handle(), + handle.get_stream()); +} + +/** + * @brief Solves the linear ordinary least squares problem `Aw = b` + * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). + * (`w = (A^T A)^-1 A^T b`) + * + * @tparam ValueType the data-type of input/output + * @param[in] handle raft::handle_t + * @param[inout] A input raft::device_matrix_view + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[inout] b input target raft::device_vector_view + * Warning: the content of this vector is modified by the cuSOLVER routines. 
+ * @param[out] w output coefficient raft::device_vector_view + */ +template +void lstsq_eig(const raft::handle_t& handle, + raft::device_matrix_view A, + raft::device_vector_view b, + raft::device_vector_view w) +{ + RAFT_EXPECTS(A.extent(1) == w.size(), "Size mismatch between A and w"); + RAFT_EXPECTS(A.extent(0) == b.size(), "Size mismatch between A and b"); + + lstsqEig(handle, + const_cast(A.data_handle()), + A.extent(0), + A.extent(1), + const_cast(b.data_handle()), + w.data_handle(), + handle.get_stream()); +} + +/** + * @brief Solves the linear ordinary least squares problem `Aw = b` + * via QR decomposition of `A = QR`. + * (triangular system of equations `Rw = Q^T b`) + * + * @tparam ValueType the data-type of input/output + * @param[in] handle raft::handle_t + * @param[inout] A input raft::device_matrix_view + * Warning: the content of this matrix is modified. + * @param[inout] b input target raft::device_vector_view + * Warning: the content of this vector is modified. + * @param[out] w output coefficient raft::device_vector_view + */ +template +void lstsq_qr(const raft::handle_t& handle, + raft::device_matrix_view A, + raft::device_vector_view b, + raft::device_vector_view w) +{ + RAFT_EXPECTS(A.extent(1) == w.size(), "Size mismatch between A and w"); + RAFT_EXPECTS(A.extent(0) == b.size(), "Size mismatch between A and b"); + + lstsqQR(handle, + const_cast(A.data_handle()), + A.extent(0), + A.extent(1), + const_cast(b.data_handle()), + w.data_handle(), + handle.get_stream()); +} + +/** @} */ // end of lstsq + }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index f90cd00ea3..3dfbea0629 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -18,109 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __LSTSQ_H -#define __LSTSQ_H - -#pragma once - -#include -#include -namespace raft { -namespace linalg { - -/** Solves the linear ordinary least squares problem `Aw = b` - * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. - * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqSvdQR(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); -} - -/** Solves the linear ordinary least squares problem `Aw = b` - * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. 
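Before the deprecated header below, a quick usage sketch of the new least-squares wrappers. The column-major layout and allocation helpers are assumptions for illustration; note that, per the warnings above, `A` and `b` are clobbered by the solver:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/lstsq.cuh>

void lstsq_example(const raft::handle_t& handle)
{
  int n_rows = 100, n_cols = 4;  // illustrative: tall-skinny system
  auto A = raft::make_device_matrix<float, int, raft::col_major>(handle, n_rows, n_cols);
  auto b = raft::make_device_vector<float, int>(handle, n_rows);
  auto w = raft::make_device_vector<float, int>(handle, n_cols);

  // Solve Aw = b in the least-squares sense via SVD decomposition of A.
  raft::linalg::lstsq_svd_qr(handle, A.view(), b.view(), w.view());
}
```

The `lstsq_svd_jacobi`, `lstsq_eig`, and `lstsq_qr` variants take the same `(handle, A, b, w)` arguments and differ only in the underlying decomposition.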
- * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqSvdJacobi(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); -} - -/** Solves the linear ordinary least squares problem `Aw = b` - * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). - * (`w = (A^T A)^-1 A^T b`) +/** + * DISCLAIMER: this file is deprecated: use lstsq.cuh instead */ -template -void lstsqEig(const raft::handle_t& handle, - const math_t* A, - const int n_rows, - const int n_cols, - const math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); -} -/** Solves the linear ordinary least squares problem `Aw = b` - * via QR decomposition of `A = QR`. - * (triangular system of equations `Rw = Q^T b`) - * - * @param[in] handle raft handle - * @param[inout] A input feature matrix. - * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param[in] n_rows number of rows in A - * @param[in] n_cols number of columns in A - * @param[inout] b input target vector. - * Warning: the content of this vector is modified by the cuSOLVER routines. - * @param[out] w output coefficient vector - * @param[in] stream cuda stream for ordering operations - */ -template -void lstsqQR(const raft::handle_t& handle, - math_t* A, - const int n_rows, - const int n_cols, - math_t* b, - math_t* w, - cudaStream_t stream) -{ - detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); -} +#pragma once -}; // namespace linalg -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "lstsq.cuh" diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 5df4d24b4f..ad35cc5880 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -20,6 +20,9 @@ #include "detail/map.cuh" +#include +#include + namespace raft { namespace linalg { @@ -37,17 +40,64 @@ namespace linalg { * @param in the input array * @param args additional input arrays */ +template +void map_k( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapImpl( + out, len, map, stream, in, args...); +} +/** + * @defgroup map Mapping ops + * @{ + */ + +/** + * @brief CUDA version of map + * @tparam InType data-type for math operation of type raft::device_mdspan + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam OutType data-type of result of type raft::device_mdspan + * @tparam Args additional parameters + * @param[in] handle raft::handle_t + * @param[in] in the input of type raft::device_mdspan + * @param[out] out the output of the map operation of type raft::device_mdspan + * @param[in] map the device-lambda + * @param[in] args additional input arrays + */ template -void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) + typename = raft::enable_if_input_device_mdspan, + typename = raft::enable_if_output_device_mdspan> +void map(const raft::handle_t& handle, InType in, OutType out, MapOp map, Args... 
args)
 {
-  detail::mapImpl(out, len, map, stream, in, args...);
+  using in_value_t  = typename InType::value_type;
+  using out_value_t = typename OutType::value_type;
+
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive");
+  RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output");
+
+  if (out.size() <= std::numeric_limits<std::uint32_t>::max()) {
+    map_k(
+      out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...);
+  } else {
+    map_k(
+      out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...);
+  }
 }
 
+/** @} */  // end of map
+
 } // namespace linalg
 }; // namespace raft
diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh
new file mode 100644
index 0000000000..180ed128a1
--- /dev/null
+++ b/cpp/include/raft/linalg/map_reduce.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __MAP_REDUCE_H
+#define __MAP_REDUCE_H
+
+#pragma once
+
+#include "detail/map_then_reduce.cuh"
+
+#include
+
+namespace raft::linalg {
+
+/**
+ * @defgroup map_reduce Map-Reduce ops
+ * @{
+ */
+
+/**
+ * @brief CUDA version of map and then generic reduction operation
+ * @tparam Type data-type upon which the math operation will be performed
+ * @tparam MapOp the device-lambda performing the actual map operation
+ * @tparam ReduceLambda the device-lambda performing the actual reduction
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @tparam Args additional parameters
+ * @param out the output reduced value (assumed to be a device pointer)
+ * @param len number of elements in the input array
+ * @param neutral The neutral element of the reduction operation. For example:
+ *                0 for sum, 1 for multiply, +Inf for Min, -Inf for Max
+ * @param map the device-lambda
+ * @param op the reduction device lambda
+ * @param stream cuda-stream where to launch this kernel
+ * @param in the input array
+ * @param args additional input arrays
+ */
+
+template
+void mapReduce(OutType* out,
+               size_t len,
+               OutType neutral,
+               MapOp map,
+               ReduceLambda op,
+               cudaStream_t stream,
+               const InType* in,
+               Args... args)
+{
+  detail::mapThenReduceImpl(
+    out, len, neutral, map, op, stream, in, args...);
+}
+
+/**
+ * @brief CUDA version of map and then generic reduction operation
+ * @tparam InValueType the data-type of the input
+ * @tparam MapOp the device-lambda performing the actual map operation
+ * @tparam ReduceLambda the device-lambda performing the actual reduction
+ * @tparam IndexType the index type
+ * @tparam OutValueType the data-type of the output
+ * @tparam ScalarIdxType index type of scalar
+ * @tparam Args additional parameters
+ * @param[in] handle raft::handle_t
+ * @param[in] in the input of type raft::device_vector_view
+ * @param[in] neutral The neutral element of the reduction operation.
For example: + * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max + * @param[out] out the output reduced value assumed to be a raft::device_scalar_view + * @param[in] map the fused device-lambda + * @param[in] op the fused reduction device lambda + * @param[in] args additional input arrays + */ +template +void map_reduce(const raft::handle_t& handle, + raft::device_vector_view in, + raft::device_scalar_view out, + OutValueType neutral, + MapOp map, + ReduceLambda op, + Args... args) +{ + mapReduce( + out.data_handle(), + in.extent(0), + neutral, + map, + op, + handle.get_stream(), + in.data_handle(), + args...); +} + +/** @} */ // end of map_reduce + +} // end namespace raft::linalg + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index 36828cf154..a69ac6df36 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -39,13 +39,14 @@ namespace linalg { template void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) { - detail::mapThenReduceImpl( + detail::mapThenReduceImpl( out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); } @@ -66,25 +67,27 @@ void mapThenSumReduce( * @param in the input array * @param args additional input arrays */ - template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) +[[deprecated("Use function `mapReduce` from `raft/linalg/map_reduce.cuh")]] void mapThenReduce( + OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) { - detail::mapThenReduceImpl( + detail::mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 235485926b..6502a84edb 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -18,79 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MAP_THEN_REDUCE_H -#define __MAP_THEN_REDUCE_H - -#pragma once - -#include "detail/map_then_reduce.cuh" - -namespace raft { -namespace linalg { - /** - * @brief CUDA version of map and then sum reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output sum-reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays + * DISCLAIMER: this file is deprecated: use map_then_reduce.cuh instead */ -template -void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... 
args) -{ - detail::mapThenReduceImpl( - out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); -} - -/** - * @brief CUDA version of map and then generic reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual map operation - * @tparam ReduceLambda the device-lambda performing the actual reduction - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param neutral The neutral element of the reduction operation. For example: - * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max - * @param map the device-lambda - * @param op the reduction device lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ +#pragma once -template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ - detail::mapThenReduceImpl( - out, len, neutral, map, op, stream, in, args...); -} -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "map_then_reduce.cuh" diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 56437313e3..1438a09bd3 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -19,6 +19,10 @@ #pragma once #include "detail/matrix_vector_op.cuh" +#include "linalg_types.hpp" + +#include +#include namespace raft { namespace linalg { @@ -99,6 +103,142 @@ void matrixVectorOp(Type* out, detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } +/** + * @defgroup matrix_vector_op Matrix Vector Operations + * @{ + */ + +/** + * @brief Operations for all the columns or rows with a given vector. + * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when adresses are optimally aligned for it. 
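A hedged sketch of the `map_reduce` entry point from the new `map_reduce.cuh` above, computing an L1 norm as a fused |x| map plus sum reduction (the mdarray factories and the const-view wrapper are assumptions, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/map_reduce.cuh>

void l1_norm(const raft::handle_t& handle)
{
  auto in  = raft::make_device_vector<float, int>(handle, 1024);
  auto out = raft::make_device_scalar<float>(handle, 0.f);
  auto in_view = raft::make_device_vector_view<const float, int>(in.data_handle(), 1024);
  // neutral element is 0 for a sum reduction
  raft::linalg::map_reduce(handle, in_view, out.view(), 0.f,
                           [] __device__(float x) { return x < 0 ? -x : x; },   // map
                           [] __device__(float a, float b) { return a + b; });  // reduce
}
```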
+ * Note : the function will also check that the size of the window of accesses
+ * is a multiple of the number of elements processed by a thread in order to
+ * enable faster processing
+ * @tparam InValueType the data-type of the input matrices and vectors
+ * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
+ * @tparam Lambda a device function which represents a binary operator
+ * @tparam OutElementType the data-type of the output raft::matrix_view
+ * @tparam IndexType Integer used for addressing
+ * @tparam TPB threads per block of the cuda kernel launched
+ * @param[in] handle raft::handle_t
+ * @param[in] matrix input raft::matrix_view
+ * @param[in] vec vector raft::vector_view
+ * @param[out] out output raft::matrix_view
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ * @param[in] op the mathematical operation
+ */
+template <typename InValueType,
+          typename LayoutPolicy,
+          typename Lambda,
+          typename OutElementType,
+          typename IndexType,
+          int TPB = 256>
+void matrix_vector_op(const raft::handle_t& handle,
+                      raft::device_matrix_view<const InValueType, IndexType, LayoutPolicy> matrix,
+                      raft::device_vector_view<const InValueType, IndexType> vec,
+                      raft::device_matrix_view<OutElementType, IndexType, LayoutPolicy> out,
+                      Apply apply,
+                      Lambda op)
+{
+  RAFT_EXPECTS(raft::is_row_or_column_major(matrix), "Input must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
+  RAFT_EXPECTS(out.size() == matrix.size(), "Size mismatch between Output and Input");
+
+  auto constexpr rowMajor = std::is_same_v<LayoutPolicy, raft::row_major>;
+  auto bcastAlongRows     = apply == Apply::ALONG_ROWS;
+
+  if (bcastAlongRows) {
+    RAFT_EXPECTS(out.extent(1) == static_cast<IndexType>(vec.size()),
+                 "Size mismatch between matrix and vector");
+  } else {
+    RAFT_EXPECTS(out.extent(0) == static_cast<IndexType>(vec.size()),
+                 "Size mismatch between matrix and vector");
+  }
+
+  matrixVectorOp(out.data_handle(),
+                 matrix.data_handle(),
+                 vec.data_handle(),
+                 out.extent(1),
+                 out.extent(0),
+                 rowMajor,
+                 bcastAlongRows,
+                 op,
+                 handle.get_stream());
+}
+
+/**
+ * @brief Operations for all the columns or rows with the given vectors.
+ * Caution : Threads process multiple elements to speed up processing. These
+ * are loaded in a single read thanks to type promotion. Faster processing
+ * would thus only be enabled when addresses are optimally aligned for it.
+ * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam InValueType the data-type of the input matrices and vectors + * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) + * @tparam Lambda a device function which represents a binary operator + * @tparam OutElementType the data-type of the output raft::matrix_view + * @tparam IndexType Integer used for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param handle raft::handle_t + * @param matrix input raft::matrix_view + * @param vec1 the first vector raft::vector_view + * @param vec2 the second vector raft::vector_view + * @param out output raft::matrix_view + * @param apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply + * @param op the mathematical operation + */ +template +void matrix_vector_op(const raft::handle_t& handle, + raft::device_matrix_view matrix, + raft::device_vector_view vec1, + raft::device_vector_view vec2, + raft::device_matrix_view out, + Apply apply, + Lambda op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(matrix), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == matrix.size(), "Size mismatch between Output and Input"); + + auto constexpr rowMajor = std::is_same_v; + auto bcastAlongRows = apply == Apply::ALONG_ROWS; + + if (bcastAlongRows) { + RAFT_EXPECTS(out.extent(1) == static_cast(vec1.size()), + "Size mismatch between matrix and vector"); + RAFT_EXPECTS(out.extent(1) == static_cast(vec2.size()), + "Size mismatch between matrix and vector"); + } else { + RAFT_EXPECTS(out.extent(0) == static_cast(vec1.size()), + "Size mismatch between matrix and vector"); + RAFT_EXPECTS(out.extent(0) == static_cast(vec2.size()), + "Size mismatch between matrix and vector"); + } + + matrixVectorOp(out.data_handle(), + matrix.data_handle(), + vec1.data_handle(), + vec2.data_handle(), + out.extent(1), + out.extent(0), + rowMajor, + bcastAlongRows, + op, + handle.get_stream()); +} + +/** @} */ // end of group matrix_vector_op + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index 574d4aee63..1237961ceb 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -18,93 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MATRIX_VECTOR_OP_H -#define __MATRIX_VECTOR_OP_H - -#pragma once - -#include "detail/matrix_vector_op.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Operations for all the columns or rows with a given vector. - * Caution : Threads process multiple elements to speed up processing. These - * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. 
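A usage sketch for the single-vector `matrix_vector_op` overload above, centering the columns of a row-major matrix in place (the factory helpers and const-view wrappers are assumptions, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/matrix_vector_op.cuh>

void subtract_col_means(const raft::handle_t& handle)
{
  auto m  = raft::make_device_matrix<float, int, raft::row_major>(handle, 100, 8);
  auto mu = raft::make_device_vector<float, int>(handle, 8);  // per-column means
  auto m_in  = raft::make_device_matrix_view<const float, int, raft::row_major>(
    m.data_handle(), 100, 8);
  auto mu_in = raft::make_device_vector_view<const float, int>(mu.data_handle(), 8);
  // broadcast `mu` along the rows: m(i, j) -= mu(j); in-place output is allowed
  raft::linalg::matrix_vector_op(handle, m_in, mu_in, m.view(),
                                 raft::linalg::Apply::ALONG_ROWS,
                                 [] __device__(float x, float v) { return x - v; });
}
```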
- * Note : the function will also check that the size of the window of accesses - * is a multiple of the number of elements processed by a thread in order to - * enable faster processing - * @tparam Type the matrix/vector type - * @tparam Lambda a device function which represents a binary operator - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output matrix (passing out = matrix makes it in-place) - * @param matrix the input matrix - * @param vec the vector - * @param D number of columns of matrix - * @param N number of rows of matrix - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether the broadcast of vector needs to happen along - * the rows of the matrix or columns - * @param op the mathematical operation - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use matrix_vector_op.cuh instead */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); -} -/** - * @brief Operations for all the columns or rows with the given vectors. - * Caution : Threads process multiple elements to speed up processing. These - * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. - * Note : the function will also check that the size of the window of accesses - * is a multiple of the number of elements processed by a thread in order to - * enable faster processing - * @tparam Type the matrix/vector type - * @tparam Lambda a device function which represents a binary operator - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output matrix (passing out = matrix makes it in-place) - * @param matrix the input matrix - * @param vec1 the first vector - * @param vec2 the second vector - * @param D number of columns of matrix - * @param N number of rows of matrix - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether the broadcast of vector needs to happen along - * the rows of the matrix or columns - * @param op the mathematical operation - * @param stream cuda stream where to launch work - */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "matrix_vector_op.cuh" diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 1b3297f926..582bab2acc 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -34,13 +34,45 @@ namespace linalg { * @param weight weight to apply to every term in the mean squared error calculation * @param stream cuda-stream where to launch this kernel */ -template +template void meanSquaredError( - math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) + out_t* out, const in_t* A, const in_t* B, size_t len, in_t weight, cudaStream_t stream) { detail::meanSquaredError(out, A, B, len, weight, stream); } +/** + * @defgroup mean_squared_error Mean Squared Error + * @{ + */ + +/** + * @brief CUDA version mean squared error function mean((A-B)**2) + * @tparam InValueType Input data-type + * @tparam IndexType Input/Output index type + * @tparam OutValueType Output data-type + * @tparam TPB threads-per-block + * @param[in] handle raft::handle_t + * @param[in] A input raft::device_vector_view + * @param[in] B input raft::device_vector_view + * @param[out] out the output mean squared error value of type raft::device_scalar_view + * @param[in] weight weight to apply to every term in the mean squared error calculation + */ +template +void mean_squared_error(const raft::handle_t& handle, + raft::device_vector_view A, + raft::device_vector_view B, + raft::device_scalar_view out, + OutValueType weight) +{ + RAFT_EXPECTS(A.size() == B.size(), "Size mismatch between inputs"); + + meanSquaredError( + out.data_handle(), A.data_handle(), B.data_handle(), A.extent(0), weight, handle.get_stream()); +} + +/** @} */ // end of group mean_squared_error + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 7a7f03ee18..cbb974e516 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -18,35 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MSE_H -#define __MSE_H - -#pragma once - -#include "detail/mean_squared_error.cuh" - -namespace raft { -namespace linalg { - /** - * @brief CUDA version mean squared error function mean((A-B)**2) - * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block - * @param out the output mean squared error value (assumed to be a device pointer) - * @param A input array (assumed to be a device pointer) - * @param B input array (assumed to be a device pointer) - * @param len number of elements in the input arrays - * @param weight weight to apply to every term in the mean squared error calculation - * @param stream cuda-stream where to launch this kernel + * DISCLAIMER: this file is deprecated: use mean_squared_error.cuh instead */ -template -void meanSquaredError( - math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) -{ - detail::meanSquaredError(out, A, B, len, weight, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "mean_squared_error.cuh" diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index f1161b23cb..119cf667d1 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -20,12 +20,17 @@ #include "detail/multiply.cuh" +#include +#include +#include + namespace raft { namespace linalg { /** * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed + * @tparam out_t data-type upon which the math operation will be performed + * @tparam in_t input data-type * @tparam IdxType Integer type used to for addressing * @param out the output buffer * @param in the input buffer @@ -34,13 +39,64 @@ namespace linalg { * @param stream cuda stream where to launch work * @{ */ -template -void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +template +void multiplyScalar(out_t* out, const in_t* in, in_t scalar, IdxType len, cudaStream_t stream) { detail::multiplyScalar(out, in, scalar, len, stream); } /** @} */ +/** + * @defgroup multiply Multiplication Arithmetic + * @{ + */ + +/** + * @brief Element-wise multiplication of host scalar + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in the input buffer + * @param[out] out the output buffer + * @param[in] scalar the scalar used in the operations + * @{ + */ +template , + typename = raft::enable_if_output_device_mdspan> +void multiply_scalar( + const raft::handle_t& handle, + InType in, + OutType out, + raft::host_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + multiplyScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + multiplyScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** @} */ // end of group multiply + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index eb933cd607..5aa481a894 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -18,35 +18,14 @@ * Please use the cuh version instead. 
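To tie two of the new entry points together, a sketch that scales a vector with `multiply_scalar` and then measures the result with `mean_squared_error` (the `make_host_scalar_view` helper and the mdarray factories are assumptions, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/core/host_mdspan.hpp>  // assumed home of make_host_scalar_view
#include <raft/linalg/mean_squared_error.cuh>
#include <raft/linalg/multiply.cuh>

void scale_then_compare(const raft::handle_t& handle)
{
  int n = 256;
  auto a   = raft::make_device_vector<float, int>(handle, n);
  auto b   = raft::make_device_vector<float, int>(handle, n);
  auto mse = raft::make_device_scalar<float>(handle, 0.f);
  float scalar = 2.f;

  // b = 2 * a; the scalar lives on the host and is passed as a host_scalar_view
  raft::linalg::multiply_scalar(handle, a.view(), b.view(),
                                raft::make_host_scalar_view<const float>(&scalar));

  // mse = mean((a - b)^2) with unit weight
  auto a_in = raft::make_device_vector_view<const float, int>(a.data_handle(), n);
  auto b_in = raft::make_device_vector_view<const float, int>(b.data_handle(), n);
  raft::linalg::mean_squared_error(handle, a_in, b_in, mse.view(), 1.0f);
}
```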
*/ -#ifndef __MULTIPLY_H -#define __MULTIPLY_H - -#pragma once - -#include "detail/multiply.cuh" - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use multiply.cuh instead */ -template -void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ - detail::multiplyScalar(out, in, scalar, len, stream); -} -/** @} */ -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "multiply.cuh" diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 87bd2a2b0a..389affef13 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -19,6 +19,10 @@ #pragma once #include "detail/norm.cuh" +#include "linalg_types.hpp" + +#include +#include namespace raft { namespace linalg { @@ -88,6 +92,61 @@ void colNorm(Type* dots, detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } +/** + * @brief Compute norm of the input matrix and perform fin_op + * @tparam ElementType Input/Output data type + * @tparam LayoutPolicy the layout of input (raft::row_major or raft::col_major) + * @tparam IdxType Integer type used to for addressing + * @tparam Lambda device final lambda + * @param[in] handle raft::handle_t + * @param[in] in the input raft::device_matrix_view + * @param[out] out the output raft::device_vector_view + * @param[in] type the type of norm to be applied + * @param[in] apply Whether to apply the norm along rows (raft::linalg::Apply::ALONG_ROWS) + or along columns (raft::linalg::Apply::ALONG_COLUMNS) + * @param[in] fin_op the final lambda op + */ +template > +void norm(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view out, + NormType type, + Apply apply, + Lambda fin_op = raft::Nop()) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + + auto constexpr row_major = std::is_same_v; + auto along_rows = apply == Apply::ALONG_ROWS; + + if (along_rows) { + RAFT_EXPECTS(static_cast(out.size()) == in.extent(0), + "Output should be equal to number of rows in Input"); + rowNorm(out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + type, + row_major, + handle.get_stream(), + fin_op); + } else { + RAFT_EXPECTS(static_cast(out.size()) == in.extent(1), + "Output should be equal to number of columns in Input"); + colNorm(out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + type, + row_major, + handle.get_stream(), + fin_op); + } +} + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index 958784d67e..b750367f05 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -18,82 +18,14 @@ * Please use the cuh version instead. 
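A sketch of the new mdspan-based `norm` above, computing one L2 norm per row of a row-major matrix (factories and const-view wrapper assumed, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/norm.cuh>

void row_l2_norms(const raft::handle_t& handle)
{
  auto m    = raft::make_device_matrix<float, int, raft::row_major>(handle, 100, 8);
  auto dots = raft::make_device_vector<float, int>(handle, 100);
  auto m_in = raft::make_device_matrix_view<const float, int, raft::row_major>(
    m.data_handle(), 100, 8);
  // Apply::ALONG_ROWS: out.size() must equal the number of rows of the input
  raft::linalg::norm(handle, m_in, dots.view(), raft::linalg::L2Norm,
                     raft::linalg::Apply::ALONG_ROWS);
}
```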
*/ -#ifndef __NORM_H -#define __NORM_H - -#pragma once - -#include "detail/norm.cuh" - -namespace raft { -namespace linalg { - -/** different types of norms supported on the input buffers */ -using detail::L1Norm; -using detail::L2Norm; -using detail::NormType; - /** - * @brief Compute row-wise norm of the input matrix and perform fin_op lambda - * - * Row-wise norm is useful while computing pairwise distance matrix, for - * example. - * This is used in many clustering algos like knn, kmeans, dbscan, etc... The - * current implementation is optimized only for bigger values of 'D'. - * - * @tparam Type the data type - * @tparam Lambda device final lambda - * @tparam IdxType Integer type used to for addressing - * @param dots the output vector of row-wise dot products - * @param data the input matrix (currently assumed to be row-major) - * @param D number of columns of data - * @param N number of rows of data - * @param type the type of norm to be applied - * @param rowMajor whether the input is row-major or not - * @param stream cuda stream where to launch work - * @param fin_op the final lambda op + * DISCLAIMER: this file is deprecated: use norm.cuh instead */ -template > -void rowNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ - detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); -} -/** - * @brief Compute column-wise norm of the input matrix and perform fin_op - * @tparam Type the data type - * @tparam Lambda device final lambda - * @tparam IdxType Integer type used to for addressing - * @param dots the output vector of column-wise dot products - * @param data the input matrix (currently assumed to be row-major) - * @param D number of columns of data - * @param N number of rows of data - * @param type the type of norm to be applied - * @param rowMajor whether the input is row-major or not - * @param stream cuda stream where to launch work - * @param fin_op the final lambda op - */ -template > -void colNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ - detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "norm.cuh" diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index f94fcfc894..acd226b71d 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -18,17 +18,19 @@ #pragma once -#include +#include #include #include +#include +#include namespace raft { namespace linalg { /** * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing + * @tparam in_t Input data-type + * @tparam out_t Output data-type * @param out the output buffer * @param in the input buffer * @param scalar the scalar used in the operations @@ -36,17 +38,18 @@ namespace linalg { * @param stream cuda stream where to launch work * @{ */ -template -void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +template +void powerScalar(out_t* out, const in_t* in, const in_t scalar, IdxType len, cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream); + out, in, len, [scalar] __device__(in_t in) { return raft::myPow(in, scalar); }, stream); } /** @} */ /** * @defgroup BinaryOps Element-wise binary operations on the input buffers - * @tparam math_t data-type upon which the math operation will be performed + * @tparam in_t Input data-type + * @tparam out_t Output data-type * @tparam IdxType Integer type used to for addressing * @param out the output buffer * @param in1 the first input buffer @@ -55,14 +58,103 @@ void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cuda * @param stream cuda stream where to launch work * @{ */ -template -void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream) +template +void power(out_t* out, const in_t* in1, const in_t* in2, IdxType len, cudaStream_t stream) { raft::linalg::binaryOp( - out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); + out, in1, in2, len, [] __device__(in_t a, in_t b) { return raft::myPow(a, b); }, stream); } /** @} */ +/** + * @defgroup power Power Arithmetic + * @{ + */ + +/** + * @brief Elementwise power operation on the input buffers + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in1 First Input + * @param[in] in2 Second Input + * @param[out] out Output + */ +template , + typename = raft::enable_if_output_device_mdspan> +void power(const raft::handle_t& handle, InType in1, InType in2, OutType out) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + if (out.size() <= std::numeric_limits::max()) { + power(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + power(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** + 
* @brief Elementwise power of host scalar to input + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[out] out Output + * @param[in] scalar raft::host_scalar_view + */ +template , + typename = raft::enable_if_output_device_mdspan> +void power_scalar( + const raft::handle_t& handle, + InType in, + OutType out, + const raft::host_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + powerScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + powerScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** @} */ // end of group add + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/power.hpp b/cpp/include/raft/linalg/power.hpp index d1506ff7a9..1e4a56d4fb 100644 --- a/cpp/include/raft/linalg/power.hpp +++ b/cpp/include/raft/linalg/power.hpp @@ -18,57 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __POWER_H -#define __POWER_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use power.cuh instead */ -template -void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream); -} -/** @} */ -/** - * @defgroup BinaryOps Element-wise binary operations on the input buffers - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - * @{ - */ -template -void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream) -{ - raft::linalg::binaryOp( - out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); -} -/** @} */ +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "power.cuh" diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index fe6a5263ca..7e6e14e680 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -36,7 +36,6 @@ namespace linalg { * @param n_rows: number rows of input matrix * @param n_cols: number columns of input matrix * @param stream cuda stream - * @{ */ template void qrGetQ(const raft::handle_t& handle, @@ -70,6 +69,47 @@ void qrGetQR(const raft::handle_t& handle, { detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); } + +/** + * @brief Compute the QR decomposition of matrix M and return only the Q matrix. + * @param[in] handle raft::handle_t + * @param[in] M Input raft::device_matrix_view + * @param[out] Q Output raft::device_matrix_view + */ +template +void qr_get_q(const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_matrix_view Q) +{ + RAFT_EXPECTS(Q.size() == M.size(), "Size mismatch between Output and Input"); + + qrGetQ(handle, M.data_handle(), Q.data_handle(), M.extent(0), M.extent(1), handle.get_stream()); +} + +/** + * @brief Compute the QR decomposition of matrix M and return both the Q and R matrices. + * @param[in] handle raft::handle_t + * @param[in] M Input raft::device_matrix_view + * @param[in] Q Output raft::device_matrix_view + * @param[out] R Output raft::device_matrix_view + */ +template +void qr_get_qr(const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_matrix_view Q, + raft::device_matrix_view R) +{ + RAFT_EXPECTS(Q.size() == M.size(), "Size mismatch between Output and Input"); + + qrGetQR(handle, + M.data_handle(), + Q.data_handle(), + R.data_handle(), + M.extent(0), + M.extent(1), + handle.get_stream()); +} + /** @} */ }; // namespace linalg diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index 7640da8c2d..9b3f4ee347 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -19,6 +19,10 @@ #pragma once #include "detail/reduce.cuh" +#include "linalg_types.hpp" + +#include +#include namespace raft { namespace linalg { @@ -60,8 +64,8 @@ template > void reduce(OutType* dots, const InType* data, - int D, - int N, + IdxType D, + IdxType N, OutType init, bool rowMajor, bool alongRows, @@ -71,10 +75,91 @@ void reduce(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::reduce( + detail::reduce( dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); } +/** + * @defgroup reduction Reduction Along Requested Dimension + * @{ + */ + +/** + * @brief Compute reduction of the input matrix along the requested dimension + * This API computes a reduction of a matrix whose underlying storage + * is either row-major or column-major, while allowing the choose the + * dimension for reduction. Depending upon the dimension chosen for + * reduction, the memory accesses may be coalesced or strided. + * + * @tparam InElementType the input data-type of underlying raft::matrix_view + * @tparam LayoutPolicy The layout of Input/Output (row or col major) + * @tparam OutElementType the output data-type of underlying raft::matrix_view and reduction + * @tparam IndexType Integer type used to for addressing + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
+ * @param[in] handle raft::handle_t + * @param[in] data Input of type raft::device_matrix_view + * @param[out] dots Output of type raft::device_matrix_view + * @param[in] init initial value to use for the reduction + * @param[in] apply whether to reduce along rows or along columns (using raft::linalg::Apply) + * @param[in] main_op fused elementwise operation to apply before reduction + * @param[in] reduce_op fused binary reduction operation + * @param[in] final_op fused elementwise operation to apply before storing results + * @param[in] inplace reduction result added inplace or overwrites old values? + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void reduce(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view dots, + OutElementType init, + Apply apply, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(data), "Input must be contiguous"); + + auto constexpr row_major = std::is_same_v; + bool along_rows = apply == Apply::ALONG_ROWS; + + if (along_rows) { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), + "Output should be equal to number of columns in Input"); + } else { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), + "Output should be equal to number of rows in Input"); + } + + reduce(dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + row_major, + along_rows, + handle.get_stream(), + inplace, + main_op, + reduce_op, + final_op); +} + +/** @} */ // end of group reduction + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp index b9cc2c6e9d..b965cfac7b 100644 --- a/cpp/include/raft/linalg/reduce.hpp +++ b/cpp/include/raft/linalg/reduce.hpp @@ -18,69 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_H -#define __REDUCE_H - -#pragma once - -#include "detail/reduce.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the requested dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D number of columns - * @param N number of rows - * @param init initial value to use for the reduction - * @param rowMajor input matrix is row-major or not - * @param alongRows whether to reduce along rows or columns - * @param stream cuda stream where to launch work - * @param inplace reduction result added inplace or overwrites old values? - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results + * DISCLAIMER: this file is deprecated: use reduce.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - bool rowMajor, - bool alongRows, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::reduce( - dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "reduce.cuh" diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index 2336639258..a7917f21f8 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -18,7 +18,10 @@ #pragma once -#include +#include "detail/reduce_cols_by_key.cuh" + +#include +#include namespace raft { namespace linalg { @@ -52,6 +55,52 @@ void reduce_cols_by_key(const T* data, { detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); } + +/** + * @defgroup reduce_cols_by_key Reduce Across Columns by Key + * @{ + */ + +/** + * @brief Computes the sum-reduction of matrix columns for each given key + * TODO: Support generic reduction lambdas https://github.com/rapidsai/raft/issues/860 + * @tparam ElementType the input data type (as well as the output reduced matrix) + * @tparam KeyType data type of the keys + * @tparam IndexType indexing arithmetic type + * @param[in] handle raft::handle_t + * @param[in] data the input data (dim = nrows x ncols). This is assumed to be in + * row-major layout of type raft::device_matrix_view + * @param[in] keys keys raft::device_vector_view (len = ncols). It is assumed that each key in this + * array is between [0, nkeys). In case this is not true, the caller is expected + * to have called make_monotonic primitive to prepare such a contiguous and + * monotonically increasing keys array. + * @param[out] out the output reduced raft::device_matrix_view along columns (dim = nrows x nkeys). 
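A sketch of the mdspan-based `reduce` added in `reduce.cuh` above; note that `Apply::ALONG_ROWS` reduces across the rows, so the output holds one value per column (factories and const-view wrapper assumed, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/reduce.cuh>

void column_sums(const raft::handle_t& handle)
{
  auto data = raft::make_device_matrix<float, int, raft::row_major>(handle, 100, 8);
  auto dots = raft::make_device_vector<float, int>(handle, 8);  // one sum per column
  auto in   = raft::make_device_matrix_view<const float, int, raft::row_major>(
    data.data_handle(), 100, 8);
  // the default main/reduce/final ops give a plain sum with init 0
  raft::linalg::reduce(handle, in, dots.view(), 0.0f, raft::linalg::Apply::ALONG_ROWS);
}
```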
+ * This will be assumed to be in row-major layout + * @param[in] nkeys number of unique keys in the keys array + */ +template +void reduce_cols_by_key( + const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view keys, + raft::device_matrix_view out, + IndexType nkeys) +{ + RAFT_EXPECTS(out.extent(0) == data.extent(0) && out.extent(1) == nkeys, + "Output is not of size nrows * nkeys"); + RAFT_EXPECTS(keys.extent(0) == data.extent(1), "Keys is not of size ncols"); + + reduce_cols_by_key(data.data_handle(), + keys.data_handle(), + out.data_handle(), + data.extent(0), + data.extent(1), + nkeys, + handle.get_stream()); +} + +/** @} */ // end of group reduce_cols_by_key + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.hpp b/cpp/include/raft/linalg/reduce_cols_by_key.hpp index c24baa60de..70851c2b69 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_cols_by_key.hpp @@ -18,45 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_COLS_BY_KEY -#define __REDUCE_COLS_BY_KEY +/** + * DISCLAIMER: this file is deprecated: use reduce_cols_by_key.cuh instead + */ #pragma once -#include - -namespace raft { -namespace linalg { +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -/** - * @brief Computes the sum-reduction of matrix columns for each given key - * @tparam T the input data type (as well as the output reduced matrix) - * @tparam KeyType data type of the keys - * @tparam IdxType indexing arithmetic type - * @param data the input data (dim = nrows x ncols). This is assumed to be in - * row-major layout - * @param keys keys array (len = ncols). It is assumed that each key in this - * array is between [0, nkeys). In case this is not true, the caller is expected - * to have called make_monotonic primitive to prepare such a contiguous and - * monotonically increasing keys array. - * @param out the output reduced matrix along columns (dim = nrows x nkeys). 
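A sketch of the new `reduce_cols_by_key` overload above, summing the columns of a row-major matrix into one output column per key (factories and const-view wrappers assumed, not part of this diff):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/reduce_cols_by_key.cuh>

void sum_cols_by_key(const raft::handle_t& handle)
{
  int nrows = 4, ncols = 6, nkeys = 3;
  auto data = raft::make_device_matrix<float, int, raft::row_major>(handle, nrows, ncols);
  auto keys = raft::make_device_vector<int, int>(handle, ncols);  // each key in [0, nkeys)
  auto out  = raft::make_device_matrix<float, int, raft::row_major>(handle, nrows, nkeys);
  auto d_in = raft::make_device_matrix_view<const float, int, raft::row_major>(
    data.data_handle(), nrows, ncols);
  auto k_in = raft::make_device_vector_view<const int, int>(keys.data_handle(), ncols);
  raft::linalg::reduce_cols_by_key(handle, d_in, k_in, out.view(), nkeys);
}
```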
- * This will be assumed to be in row-major layout - * @param nrows number of rows in the input data - * @param ncols number of colums in the input data - * @param nkeys number of unique keys in the keys array - * @param stream cuda stream to launch the kernel onto - */ -template -void reduce_cols_by_key(const T* data, - const KeyIteratorT keys, - T* out, - IdxType nrows, - IdxType ncols, - IdxType nkeys, - cudaStream_t stream) -{ - detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); -} -}; // end namespace linalg -}; // end namespace raft -#endif \ No newline at end of file +#include "reduce_cols_by_key.cuh" diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index ca7a956986..39c54e8b0c 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -18,7 +18,10 @@ #pragma once -#include +#include "detail/reduce_rows_by_key.cuh" + +#include +#include namespace raft { namespace linalg { @@ -53,7 +56,7 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) * @param[in] stream CUDA stream */ template -void reduce_rows_by_key(const DataIteratorT d_A, +void reduce_rows_by_key(const DataIteratorT* d_A, int lda, const KeysIteratorT d_keys, const WeightT* d_weights, @@ -61,7 +64,7 @@ void reduce_rows_by_key(const DataIteratorT d_A, int nrows, int ncols, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t stream) { detail::reduce_rows_by_key( @@ -85,17 +88,17 @@ void reduce_rows_by_key(const DataIteratorT d_A, * @param[in] stream CUDA stream */ template -void reduce_rows_by_key(const DataIteratorT d_A, +void reduce_rows_by_key(const DataIteratorT* d_A, int lda, - const KeysIteratorT d_keys, + KeysIteratorT d_keys, char* d_keys_char, int nrows, int ncols, int nkeys, - DataIteratorT d_sums, + DataIteratorT* d_sums, cudaStream_t stream) { - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; reduce_rows_by_key(d_A, lda, d_keys, @@ -108,6 +111,69 @@ void reduce_rows_by_key(const DataIteratorT d_A, stream); } +/** + * @defgroup reduce_rows_by_key Reduce Across Rows by Key + * @{ + */ + +/** + * @brief Computes the weighted sum-reduction of matrix rows for each given key + * TODO: Support generic reduction lambdas https://github.com/rapidsai/raft/issues/860 + * @tparam ElementType data-type of input and output + * @tparam KeyType data-type of keys + * @tparam WeightType data-type of weights + * @tparam IndexType index type + * @param[in] handle raft::handle_t + * @param[in] d_A Input raft::device_mdspan (ncols * nrows) + * @param[in] d_keys Keys for each row raft::device_vector_view (1 x nrows) + * @param[out] d_sums Row sums by key raft::device_matrix_view (ncols x d_keys) + * @param[in] n_unique_keys Number of unique keys in d_keys + * @param[in] d_weights Weights for each observation in d_A raft::device_vector_view optional (1 + * x nrows) + * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view + */ +template +void reduce_rows_by_key( + const raft::handle_t& handle, + raft::device_matrix_view d_A, + raft::device_vector_view d_keys, + raft::device_matrix_view d_sums, + IndexType n_unique_keys, + raft::device_vector_view d_keys_char, + std::optional> d_weights = std::nullopt) +{ + RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys, + "Output is not of size ncols * n_unique_keys"); + 
RAFT_EXPECTS(d_keys.extent(0) == d_A.extent(1), "Keys is not of size nrows"); + + if (d_weights) { + RAFT_EXPECTS(d_weights.value().extent(0) == d_A.extent(1), "Weights is not of size nrows"); + + reduce_rows_by_key(d_A.data_handle(), + d_A.extent(0), + d_keys.data_handle(), + d_weights.value().data_handle(), + d_keys_char.data_handle(), + d_A.extent(1), + d_A.extent(0), + n_unique_keys, + d_sums.data_handle(), + handle.get_stream()); + } else { + reduce_rows_by_key(d_A.data_handle(), + d_A.extent(0), + d_keys.data_handle(), + d_keys_char.data_handle(), + d_A.extent(1), + d_A.extent(0), + n_unique_keys, + d_sums.data_handle(), + handle.get_stream()); + } +} + +/** @} */ // end of group reduce_rows_by_key + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.hpp b/cpp/include/raft/linalg/reduce_rows_by_key.hpp index d18a00aa1d..4b5e76ea8f 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_rows_by_key.hpp @@ -18,102 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __REDUCE_ROWS_BY_KEY -#define __REDUCE_ROWS_BY_KEY - -#pragma once - -#include - -namespace raft { -namespace linalg { - -/** - Small helper function to convert from int->char and char->int - Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars -**/ -template -void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) -{ - detail::convert_array(dst, src, n, st); -} - /** - * @brief Computes the weighted reduction of matrix rows for each given key - * - * @tparam DataIteratorT Random-access iterator type, for reading input matrix - * (may be a simple pointer type) - * @tparam KeysIteratorT Random-access iterator type, for reading input keys - * (may be a simple pointer type) - * - * @param[in] d_A Input data array (lda x nrows) - * @param[in] lda Real row size for input data, d_A - * @param[in] d_keys Keys for each row (1 x nrows) - * @param[in] d_weights Weights for each observation in d_A (1 x nrows) - * @param[out] d_keys_char Scratch memory for conversion of keys to char - * @param[in] nrows Number of rows in d_A and d_keys - * @param[in] ncols Number of data columns in d_A - * @param[in] nkeys Number of unique keys in d_keys - * @param[out] d_sums Row sums by key (ncols x d_keys) - * @param[in] stream CUDA stream + * DISCLAIMER: this file is deprecated: use reduce_rows_by_key.cuh instead */ -template -void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - const WeightT* d_weights, - char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) -{ - detail::reduce_rows_by_key( - d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); -} -/** - * @brief Computes the reduction of matrix rows for each given key - * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple - * pointer type) - * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple - * pointer type) - * @param[in] d_A Input data array (lda x nrows) - * @param[in] lda Real row size for input data, d_A - * @param[in] d_keys Keys for each row (1 x nrows) - * @param d_keys_char Scratch memory for conversion of keys to char - * @param[in] nrows Number of rows in d_A and d_keys - * @param[in] ncols Number of data columns in d_A - * @param[in] nkeys Number of unique keys in d_keys - * @param[out] d_sums Row sums by key (ncols x d_keys) - * 
@param[in] stream CUDA stream - */ -template -void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) -{ - typedef typename std::iterator_traits::value_type DataType; - reduce_rows_by_key(d_A, - lda, - d_keys, - static_cast(nullptr), - d_keys_char, - nrows, - ncols, - nkeys, - d_sums, - stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "reduce_rows_by_key.cuh" diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh index f5eaba7526..e465ee6fa2 100644 --- a/cpp/include/raft/linalg/rsvd.cuh +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -18,7 +18,9 @@ #pragma once -#include +#include "detail/rsvd.cuh" + +#include namespace raft { namespace linalg { @@ -137,6 +139,653 @@ void rsvdPerc(const raft::handle_t& handle, stream); } +/** + * @defgroup rsvd Randomized Singular Value Decomposition + * @{ + */ + +/** + * @brief randomized singular value decomposition (RSVD) on a column major + * rectangular matrix using QR decomposition, by specifying no. of PCs and + * upsamples directly + * @param[in] handle raft::handle_t + * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] S_vec singular values raft::device_vector_view of shape (K) + * @param[in] p no. of upsamples + * @param[out] U optional left singular values of raft::device_matrix_view with layout + * raft::col_major + * @param[out] V optional right singular values of raft::device_matrix_view with layout + * raft::col_major + */ +template +void rsvd_fixed_rank( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + IndexType p, + std::optional> U = std::nullopt, + std::optional> V = std::nullopt) +{ + if (U) { + RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U"); + RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1), + "Number of columns in U should be equal to length of S"); + } + if (V) { + RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V"); + RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0), + "Number of rows in V should be equal to length of S"); + } + + rsvdFixedRank(handle, + const_cast(M.data_handle()), + M.extent(0), + M.extent(1), + S_vec.data_handle(), + U.value().data_handle(), + V.value().data_handle(), + S_vec.extent(0), + p, + false, + U.has_value(), + V.has_value(), + false, + static_cast(0), + 0, + handle.get_stream()); +} + +/** + * @brief Overload of `rsvd_fixed_rank` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `rsvd_fixed_rank`. 
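A sketch of `rsvd_fixed_rank` above, asking for k singular values with p extra upsamples; the shapes follow the RAFT_EXPECTS checks in the hunk (U is n_rows x k, V is k x n_cols; factories and const-view wrapper assumed, not part of this diff):

```cpp
#include <optional>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/rsvd.cuh>

void rsvd_example(const raft::handle_t& handle)
{
  int n_rows = 1000, n_cols = 64, k = 8, p = 4;
  auto M = raft::make_device_matrix<float, int, raft::col_major>(handle, n_rows, n_cols);
  auto S = raft::make_device_vector<float, int>(handle, k);
  auto U = raft::make_device_matrix<float, int, raft::col_major>(handle, n_rows, k);
  auto V = raft::make_device_matrix<float, int, raft::col_major>(handle, k, n_cols);
  auto M_in = raft::make_device_matrix_view<const float, int, raft::col_major>(
    M.data_handle(), n_rows, n_cols);
  raft::linalg::rsvd_fixed_rank(handle, M_in, S.view(), p,
                                std::make_optional(U.view()),
                                std::make_optional(V.view()));
}
```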
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_fixed_rank(const raft::handle_t& handle,
+                     raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+                     raft::device_vector_view<ValueType, IndexType> S_vec,
+                     IndexType p,
+                     UType&& U,
+                     VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_fixed_rank(handle, M, S_vec, p, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using symmetric Eigen decomposition, by specifying no. of PCs and
+ * upsamples directly. The rectangular input matrix is made square and symmetric using B @ B^T
+ * @param[in] handle raft::handle_t
+ * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
+ * @param[out] S_vec singular values raft::device_vector_view of shape (K)
+ * @param[in] p no. of upsamples
+ * @param[out] U optional left singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ * @param[out] V optional right singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ */
+template <typename ValueType, typename IndexType>
+void rsvd_fixed_rank_symmetric(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  IndexType p,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U = std::nullopt,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V = std::nullopt)
+{
+  if (U) {
+    RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U");
+    RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1),
+                 "Number of columns in U should be equal to length of S");
+  }
+  if (V) {
+    RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V");
+    RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0),
+                 "Number of rows in V should be equal to length of S");
+  }
+
+  rsvdFixedRank(handle,
+                const_cast<ValueType*>(M.data_handle()),
+                M.extent(0),
+                M.extent(1),
+                S_vec.data_handle(),
+                U.value().data_handle(),
+                V.value().data_handle(),
+                S_vec.extent(0),
+                p,
+                true,
+                U.has_value(),
+                V.has_value(),
+                false,
+                static_cast<ValueType>(0),
+                0,
+                handle.get_stream());
+}
+
+/**
+ * @brief Overload of `rsvd_fixed_rank_symmetric` to help the
+ * compiler find the above overload, in case users pass in
+ * `std::nullopt` for one or both of the optional arguments.
+ *
+ * Please see above for documentation of `rsvd_fixed_rank_symmetric`.
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_fixed_rank_symmetric(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  IndexType p,
+  UType&& U,
+  VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_fixed_rank_symmetric(handle, M, S_vec, p, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using Jacobi method, by specifying no. of PCs and
+ * upsamples directly
+ * @param[in] handle raft::handle_t
+ * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
+ * @param[out] S_vec singular values raft::device_vector_view of shape (K)
+ * @param[in] p no. of upsamples
+ * @param[in] tol tolerance for Jacobi-based solvers
+ * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers
+ * @param[out] U optional left singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ * @param[out] V optional right singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ */
+template <typename ValueType, typename IndexType>
+void rsvd_fixed_rank_jacobi(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  IndexType p,
+  ValueType tol,
+  int max_sweeps,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U = std::nullopt,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V = std::nullopt)
+{
+  if (U) {
+    RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U");
+    RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1),
+                 "Number of columns in U should be equal to length of S");
+  }
+  if (V) {
+    RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V");
+    RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0),
+                 "Number of rows in V should be equal to length of S");
+  }
+
+  rsvdFixedRank(handle,
+                const_cast<ValueType*>(M.data_handle()),
+                M.extent(0),
+                M.extent(1),
+                S_vec.data_handle(),
+                U.value().data_handle(),
+                V.value().data_handle(),
+                S_vec.extent(0),
+                p,
+                false,
+                U.has_value(),
+                V.has_value(),
+                true,
+                tol,
+                max_sweeps,
+                handle.get_stream());
+}
+
+/**
+ * @brief Overload of `rsvd_fixed_rank_jacobi` to help the
+ * compiler find the above overload, in case users pass in
+ * `std::nullopt` for one or both of the optional arguments.
+ *
+ * Please see above for documentation of `rsvd_fixed_rank_jacobi`.
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_fixed_rank_jacobi(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  IndexType p,
+  ValueType tol,
+  int max_sweeps,
+  UType&& U,
+  VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_fixed_rank_jacobi(handle, M, S_vec, p, tol, max_sweeps, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using Jacobi method, by specifying no. of PCs and
+ * upsamples directly. The rectangular input matrix is made square and symmetric using B @ B^T
+ * @param[in] handle raft::handle_t
+ * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
+ * @param[out] S_vec singular values raft::device_vector_view of shape (K)
+ * @param[in] p no.
of upsamples + * @param[in] tol tolerance for Jacobi-based solvers + * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers + * @param[out] U optional left singular values of raft::device_matrix_view with layout + * raft::col_major + * @param[out] V optional right singular values of raft::device_matrix_view with layout + * raft::col_major + */ +template +void rsvd_fixed_rank_symmetric_jacobi( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + IndexType p, + ValueType tol, + int max_sweeps, + std::optional> U = std::nullopt, + std::optional> V = std::nullopt) +{ + if (U) { + RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U"); + RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1), + "Number of columns in U should be equal to length of S"); + } + if (V) { + RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V"); + RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0), + "Number of rows in V should be equal to length of S"); + } + + rsvdFixedRank(handle, + const_cast(M.data_handle()), + M.extent(0), + M.extent(1), + S_vec.data_handle(), + U.value().data_handle(), + V.value().data_handle(), + S_vec.extent(0), + p, + true, + U.has_value(), + V.has_value(), + true, + tol, + max_sweeps, + handle.get_stream()); +} + +/** + * @brief Overload of `rsvd_fixed_rank_symmetric_jacobi` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `rsvd_fixed_rank_symmetric_jacobi`. + */ +template +void rsvd_fixed_rank_symmetric_jacobi( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + IndexType p, + ValueType tol, + int max_sweeps, + UType&& U, + VType&& V) +{ + std::optional> U_optional = + std::forward(U); + std::optional> V_optional = + std::forward(V); + + rsvd_fixed_rank_symmetric_jacobi(handle, M, S_vec, p, tol, max_sweeps, U_optional, V_optional); +} + +/** + * @brief randomized singular value decomposition (RSVD) on a column major + * rectangular matrix using QR decomposition, by specifying the PC and upsampling + * ratio + * @param[in] handle raft::handle_t + * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] S_vec singular values raft::device_vector_view of shape (K) + * @param[in] PC_perc percentage of singular values to be computed + * @param[in] UpS_perc upsampling percentage + * @param[out] U optional left singular values of raft::device_matrix_view with layout + * raft::col_major + * @param[out] V optional right singular values of raft::device_matrix_view with layout + * raft::col_major + */ +template +void rsvd_perc( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + ValueType PC_perc, + ValueType UpS_perc, + std::optional> U = std::nullopt, + std::optional> V = std::nullopt) +{ + if (U) { + RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U"); + RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1), + "Number of columns in U should be equal to length of S"); + } + if (V) { + RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V"); + RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0), + "Number of rows in V should be equal to length of S"); + } + + rsvdPerc(handle, + const_cast(M.data_handle()), + 
M.extent(0),
+           M.extent(1),
+           S_vec.data_handle(),
+           U.value().data_handle(),
+           V.value().data_handle(),
+           PC_perc,
+           UpS_perc,
+           false,
+           U.has_value(),
+           V.has_value(),
+           false,
+           static_cast<ValueType>(0),
+           0,
+           handle.get_stream());
+}
+
+/**
+ * @brief Overload of `rsvd_perc` to help the
+ * compiler find the above overload, in case users pass in
+ * `std::nullopt` for one or both of the optional arguments.
+ *
+ * Please see above for documentation of `rsvd_perc`.
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_perc(const raft::handle_t& handle,
+               raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+               raft::device_vector_view<ValueType, IndexType> S_vec,
+               ValueType PC_perc,
+               ValueType UpS_perc,
+               UType&& U,
+               VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_perc(handle, M, S_vec, PC_perc, UpS_perc, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using symmetric Eigen decomposition, by specifying the PC and upsampling
+ * ratio. The rectangular input matrix is made square and symmetric using B @ B^T
+ * @param[in] handle raft::handle_t
+ * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
+ * @param[out] S_vec singular values raft::device_vector_view of shape (K)
+ * @param[in] PC_perc percentage of singular values to be computed
+ * @param[in] UpS_perc upsampling percentage
+ * @param[out] U optional left singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ * @param[out] V optional right singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ */
+template <typename ValueType, typename IndexType>
+void rsvd_perc_symmetric(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  ValueType PC_perc,
+  ValueType UpS_perc,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U = std::nullopt,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V = std::nullopt)
+{
+  if (U) {
+    RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U");
+    RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1),
+                 "Number of columns in U should be equal to length of S");
+  }
+  if (V) {
+    RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V");
+    RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0),
+                 "Number of rows in V should be equal to length of S");
+  }
+
+  rsvdPerc(handle,
+           const_cast<ValueType*>(M.data_handle()),
+           M.extent(0),
+           M.extent(1),
+           S_vec.data_handle(),
+           U.value().data_handle(),
+           V.value().data_handle(),
+           PC_perc,
+           UpS_perc,
+           true,
+           U.has_value(),
+           V.has_value(),
+           false,
+           static_cast<ValueType>(0),
+           0,
+           handle.get_stream());
+}
+
+/**
+ * @brief Overload of `rsvd_perc_symmetric` to help the
+ * compiler find the above overload, in case users pass in
+ * `std::nullopt` for one or both of the optional arguments.
+ *
+ * Please see above for documentation of `rsvd_perc_symmetric`.
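+ *
+ * A minimal usage sketch of the primary overload above (sizes, types, and
+ * the percentage values are illustrative assumptions; `M` is assumed to be
+ * filled with data):
+ * @code{.cpp}
+ *   int m = 1000, n = 100, rank = 10;
+ *   auto M = raft::make_device_matrix<float, int, raft::col_major>(handle, m, n);
+ *   auto S = raft::make_device_vector<float, int>(handle, rank);
+ *   auto U = raft::make_device_matrix<float, int, raft::col_major>(handle, m, rank);
+ *   auto V = raft::make_device_matrix<float, int, raft::col_major>(handle, rank, n);
+ *   raft::device_matrix_view<const float, int, raft::col_major> M_view = M.view();
+ *   raft::linalg::rsvd_perc_symmetric(
+ *     handle, M_view, S.view(), 0.1f, 0.05f,
+ *     std::make_optional(U.view()), std::make_optional(V.view()));
+ * @endcode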
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_perc_symmetric(const raft::handle_t& handle,
+                         raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+                         raft::device_vector_view<ValueType, IndexType> S_vec,
+                         ValueType PC_perc,
+                         ValueType UpS_perc,
+                         UType&& U,
+                         VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_perc_symmetric(handle, M, S_vec, PC_perc, UpS_perc, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using Jacobi method, by specifying the PC and upsampling
+ * ratio
+ * @param[in] handle raft::handle_t
+ * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
+ * @param[out] S_vec singular values raft::device_vector_view of shape (K)
+ * @param[in] PC_perc percentage of singular values to be computed
+ * @param[in] UpS_perc upsampling percentage
+ * @param[in] tol tolerance for Jacobi-based solvers
+ * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers
+ * @param[out] U optional left singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ * @param[out] V optional right singular values of raft::device_matrix_view with layout
+ * raft::col_major
+ */
+template <typename ValueType, typename IndexType>
+void rsvd_perc_jacobi(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+  raft::device_vector_view<ValueType, IndexType> S_vec,
+  ValueType PC_perc,
+  ValueType UpS_perc,
+  ValueType tol,
+  int max_sweeps,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U = std::nullopt,
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V = std::nullopt)
+{
+  if (U) {
+    RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U");
+    RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1),
+                 "Number of columns in U should be equal to length of S");
+  }
+  if (V) {
+    RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V");
+    RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0),
+                 "Number of rows in V should be equal to length of S");
+  }
+
+  rsvdPerc(handle,
+           const_cast<ValueType*>(M.data_handle()),
+           M.extent(0),
+           M.extent(1),
+           S_vec.data_handle(),
+           U.value().data_handle(),
+           V.value().data_handle(),
+           PC_perc,
+           UpS_perc,
+           false,
+           U.has_value(),
+           V.has_value(),
+           true,
+           tol,
+           max_sweeps,
+           handle.get_stream());
+}
+
+/**
+ * @brief Overload of `rsvd_perc_jacobi` to help the
+ * compiler find the above overload, in case users pass in
+ * `std::nullopt` for one or both of the optional arguments.
+ *
+ * Please see above for documentation of `rsvd_perc_jacobi`.
+ */
+template <typename ValueType, typename IndexType, typename UType, typename VType>
+void rsvd_perc_jacobi(const raft::handle_t& handle,
+                      raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
+                      raft::device_vector_view<ValueType, IndexType> S_vec,
+                      ValueType PC_perc,
+                      ValueType UpS_perc,
+                      ValueType tol,
+                      int max_sweeps,
+                      UType&& U,
+                      VType&& V)
+{
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> U_optional =
+    std::forward<UType>(U);
+  std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> V_optional =
+    std::forward<VType>(V);
+
+  rsvd_perc_jacobi(handle, M, S_vec, PC_perc, UpS_perc, tol, max_sweeps, U_optional, V_optional);
+}
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on a column major
+ * rectangular matrix using Jacobi method, by specifying the PC and upsampling
+ * ratio.
The rectangular input matrix is made square and symmetric using B @ B^T + * @param[in] handle raft::handle_t + * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] S_vec singular values raft::device_vector_view of shape (K) + * @param[in] PC_perc percentage of singular values to be computed + * @param[in] UpS_perc upsampling percentage + * @param[in] tol tolerance for Jacobi-based solvers + * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers + * @param[out] U optional left singular values of raft::device_matrix_view with layout + * raft::col_major + * @param[out] V optional right singular values of raft::device_matrix_view with layout + * raft::col_major + */ +template +void rsvd_perc_symmetric_jacobi( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + ValueType PC_perc, + ValueType UpS_perc, + ValueType tol, + int max_sweeps, + std::optional> U = std::nullopt, + std::optional> V = std::nullopt) +{ + if (U) { + RAFT_EXPECTS(M.extent(0) == U.value().extent(0), "Number of rows in M should be equal to U"); + RAFT_EXPECTS(S_vec.extent(0) == U.value().extent(1), + "Number of columns in U should be equal to length of S"); + } + if (V) { + RAFT_EXPECTS(M.extent(1) == V.value().extent(1), "Number of columns in M should be equal to V"); + RAFT_EXPECTS(S_vec.extent(0) == V.value().extent(0), + "Number of rows in V should be equal to length of S"); + } + + rsvdPerc(handle, + const_cast(M.data_handle()), + M.extent(0), + M.extent(1), + S_vec.data_handle(), + U.value().data_handle(), + V.value().data_handle(), + PC_perc, + UpS_perc, + true, + U.has_value(), + V.has_value(), + true, + tol, + max_sweeps, + handle.get_stream()); +} + +/** + * @brief Overload of `rsvd_perc_symmetric_jacobi` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `rsvd_perc_symmetric_jacobi`. + */ +template +void rsvd_perc_symmetric_jacobi( + const raft::handle_t& handle, + raft::device_matrix_view M, + raft::device_vector_view S_vec, + ValueType PC_perc, + ValueType UpS_perc, + ValueType tol, + int max_sweeps, + UType&& U, + VType&& V) +{ + std::optional> U_optional = + std::forward(U); + std::optional> V_optional = + std::forward(V); + + rsvd_perc_symmetric_jacobi( + handle, M, S_vec, PC_perc, UpS_perc, tol, max_sweeps, U_optional, V_optional); +} + +/** @} */ // end of group rsvd + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/rsvd.hpp b/cpp/include/raft/linalg/rsvd.hpp index ac6e13b555..7e2fffba75 100644 --- a/cpp/include/raft/linalg/rsvd.hpp +++ b/cpp/include/raft/linalg/rsvd.hpp @@ -18,131 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __RSVD_H -#define __RSVD_H - -#pragma once - -#include - -namespace raft { -namespace linalg { - /** - * @brief randomized singular value decomposition (RSVD) on the column major - * float type input matrix (Jacobi-based), by specifying no. of PCs and - * upsamples directly - * @param handle: raft handle - * @param M: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param S_vec: singular values of input matrix - * @param U: left singular values of input matrix - * @param V: right singular values of input matrix - * @param k: no. of singular values to be computed - * @param p: no. 
of upsamples - * @param use_bbt: whether use eigen decomposition in computation or not - * @param gen_left_vec: left vector needs to be generated or not? - * @param gen_right_vec: right vector needs to be generated or not? - * @param use_jacobi: whether to jacobi solver for decomposition - * @param tol: tolerance for Jacobi-based solvers - * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use rsvd.cuh instead */ -template -void rsvdFixedRank(const raft::handle_t& handle, - math_t* M, - int n_rows, - int n_cols, - math_t* S_vec, - math_t* U, - math_t* V, - int k, - int p, - bool use_bbt, - bool gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::rsvdFixedRank(handle, - M, - n_rows, - n_cols, - S_vec, - U, - V, - k, - p, - use_bbt, - gen_left_vec, - gen_right_vec, - use_jacobi, - tol, - max_sweeps, - stream); -} -/** - * @brief randomized singular value decomposition (RSVD) on the column major - * float type input matrix (Jacobi-based), by specifying the PC and upsampling - * ratio - * @param handle: raft handle - * @param M: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param S_vec: singular values of input matrix - * @param U: left singular values of input matrix - * @param V: right singular values of input matrix - * @param PC_perc: percentage of singular values to be computed - * @param UpS_perc: upsampling percentage - * @param use_bbt: whether use eigen decomposition in computation or not - * @param gen_left_vec: left vector needs to be generated or not? - * @param gen_right_vec: right vector needs to be generated or not? - * @param use_jacobi: whether to jacobi solver for decomposition - * @param tol: tolerance for Jacobi-based solvers - * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers - * @param stream cuda stream - */ -template -void rsvdPerc(const raft::handle_t& handle, - math_t* M, - int n_rows, - int n_cols, - math_t* S_vec, - math_t* U, - math_t* V, - math_t PC_perc, - math_t UpS_perc, - bool use_bbt, - bool gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::rsvdPerc(handle, - M, - n_rows, - n_cols, - S_vec, - U, - V, - PC_perc, - UpS_perc, - use_bbt, - gen_left_vec, - gen_right_vec, - use_jacobi, - tol, - max_sweeps, - stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "rsvd.cuh" diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index b58bc752ac..2951285c3a 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -18,8 +18,9 @@ #pragma once -#include +#include #include +#include namespace raft { namespace linalg { @@ -34,14 +35,55 @@ namespace linalg { * @param stream cuda stream where to launch work * @{ */ -template -void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) +template +void sqrt(out_t* out, const in_t* in, IdxType len, cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); + out, in, len, [] __device__(in_t in) { return raft::mySqrt(in); }, stream); } /** @} */ +/** + * @defgroup sqrt Sqrt Arithmetic + * @{ + */ + +/** + * @brief Elementwise sqrt operation + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[out] out Output + */ +template , + typename = raft::enable_if_output_device_mdspan> +void sqrt(const raft::handle_t& handle, InType in, OutType out) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input 1 must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Inputs"); + + if (out.size() <= std::numeric_limits::max()) { + sqrt(out.data_handle(), + in.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + sqrt(out.data_handle(), + in.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** @} */ // end of group add + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/sqrt.hpp b/cpp/include/raft/linalg/sqrt.hpp index 9c66ee2d14..e0f77f0ab9 100644 --- a/cpp/include/raft/linalg/sqrt.hpp +++ b/cpp/include/raft/linalg/sqrt.hpp @@ -18,36 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SQRT_H -#define __SQRT_H - -#pragma once - -#include -#include - -namespace raft { -namespace linalg { - /** - * @defgroup ScalarOps Scalar operations on the input buffer - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out the output buffer - * @param in the input buffer - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work - * @{ + * DISCLAIMER: this file is deprecated: use sqrt.cuh instead */ -template -void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) -{ - raft::linalg::unaryOp( - out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); -} -/** @} */ -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "sqrt.cuh" diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index 941e64dcb1..9147692c03 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -21,6 +21,9 @@ #include "detail/strided_reduction.cuh" +#include +#include + namespace raft { namespace linalg { @@ -68,9 +71,94 @@ void stridedReduction(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + detail::stridedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } +/** + * @defgroup strided_reduction Strided Memory Access Reductions + * For reducing along rows for row-major and along columns for column-major + * @{ + */ + +/** + * @brief Compute reduction of the input matrix along the strided dimension + * This API is to be used when the desired reduction is NOT along the dimension + * of the memory layout. For example, a row-major matrix will be reduced + * along the rows whereas a column-major matrix will be reduced along + * the columns. + * + * @tparam InValueType the input data-type of underlying raft::matrix_view + * @tparam LayoutPolicy The layout of Input/Output (row or col major) + * @tparam OutValueType the output data-type of underlying raft::matrix_view and reduction + * @tparam IndexType Integer type used to for addressing + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
<pre>OutType (*MainLambda)(InType, IdxType);</pre>
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ *         <pre>OutType (*ReduceLambda)(OutType, OutType);</pre>
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ *         <pre>OutType (*FinalLambda)(OutType);</pre>
+ * @param[in] handle raft::handle_t + * @param[in] data Input of type raft::device_matrix_view + * @param[out] dots Output of type raft::device_matrix_view + * @param[in] init initial value to use for the reduction + * @param[in] main_op fused elementwise operation to apply before reduction + * @param[in] reduce_op fused binary reduction operation + * @param[in] final_op fused elementwise operation to apply before storing results + * @param[in] inplace reduction result added inplace or overwrites old values? + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void strided_reduction(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view dots, + OutValueType init, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + if constexpr (std::is_same_v) { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), + "Output should be equal to number of columns in Input"); + + stridedReduction(dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + handle.get_stream(), + inplace, + main_op, + reduce_op, + final_op); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), + "Output should be equal to number of rows in Input"); + + stridedReduction(dots.data_handle(), + data.data_handle(), + data.extent(0), + data.extent(1), + init, + handle.get_stream(), + inplace, + main_op, + reduce_op, + final_op); + } +} + +/** @} */ // end of group strided_reduction + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 3b1597dfc3..6720a302ea 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -18,64 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __STRIDED_REDUCTION_H -#define __STRIDED_REDUCTION_H - -#pragma once - -#include "detail/strided_reduction.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Compute reduction of the input matrix along the strided dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
<pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- *         <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- *         <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use strided_reduction.cuh instead */ -template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ - detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "strided_reduction.cuh" diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 9ca36ddddf..4f81822a13 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -21,6 +21,10 @@ #include "detail/subtract.cuh" +#include +#include +#include + namespace raft { namespace linalg { @@ -84,6 +88,140 @@ void subtractDevScalar(math_t* outDev, detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); } +/** + * @defgroup sub Subtraction Arithmetic + * @{ + */ + +/** + * @brief Elementwise subtraction operation on the input buffers + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @param handle raft::handle_t + * @param[in] in1 First Input + * @param[in] in2 Second Input + * @param[out] out Output + */ +template , + typename = raft::enable_if_output_device_mdspan> +void subtract(const raft::handle_t& handle, InType in1, InType in2, OutType out) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + if (out.size() <= std::numeric_limits::max()) { + subtract(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + subtract(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** + * @brief Elementwise subtraction of device scalar to input + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[out] out Output + * @param[in] scalar raft::device_scalar_view + */ +template , + typename = raft::enable_if_output_device_mdspan> +void subtract_scalar( + const raft::handle_t& handle, + 
InType in, + OutType out, + raft::device_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + subtractDevScalar( + out.data_handle(), + in.data_handle(), + scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + subtractDevScalar( + out.data_handle(), + in.data_handle(), + scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** + * @brief Elementwise subtraction of host scalar to input + * @tparam InType Input Type raft::device_mdspan + * @tparam OutType Output Type raft::device_mdspan + * @tparam ScalarIdxType Index Type of scalar + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[out] out Output + * @param[in] scalar raft::host_scalar_view + */ +template , + typename = raft::enable_if_output_device_mdspan> +void subtract_scalar( + const raft::handle_t& handle, + InType in, + OutType out, + raft::host_scalar_view scalar) +{ + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + if (out.size() <= std::numeric_limits::max()) { + subtractScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } else { + subtractScalar(out.data_handle(), + in.data_handle(), + *scalar.data_handle(), + static_cast(out.size()), + handle.get_stream()); + } +} + +/** @} */ // end of group subtract + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index accf57a939..b0c6508ffe 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -18,77 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SUBTRACT_H -#define __SUBTRACT_H - -#pragma once - -#include "detail/subtract.cuh" - -namespace raft { -namespace linalg { - /** - * @brief Elementwise scalar subtraction operation on the input buffer - * - * @tparam InT input data-type. Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in the input buffer - * @param scalar the scalar used in the operations - * @param len number of elements in the input buffer - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use subtract.cuh instead */ -template -void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ - detail::subtractScalar(out, in, scalar, len, stream); -} -/** - * @brief Elementwise subtraction operation on the input buffers - * @tparam InT input data-type. 
Also the data-type upon which the math ops - * will be performed - * @tparam OutT output data-type - * @tparam IdxType Integer type used to for addressing - * - * @param out the output buffer - * @param in1 the first input buffer - * @param in2 the second input buffer - * @param len number of elements in the input buffers - * @param stream cuda stream where to launch work - */ -template -void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ - detail::subtract(out, in1, in2, len, stream); -} - -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param outDev the output buffer - * @param inDev the input buffer - * @param singleScalarDev pointer to the scalar located in device memory - * @param len number of elements in the input and output buffer - * @param stream cuda stream - * @remark block size has not been tuned - */ -template -void subtractDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ - detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "subtract.cuh" diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index b48def90a3..0026ec1f7d 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -20,6 +20,8 @@ #include "detail/svd.cuh" +#include + namespace raft { namespace linalg { @@ -38,9 +40,6 @@ namespace linalg { * @param gen_right_vec: generate right eig vector. Not activated. * @param stream cuda stream */ -// TODO: activate gen_left_vec and gen_right_vec options -// TODO: couldn't template this function due to cusolverDnSgesvd and -// cusolverSnSgesvd. Check if there is any other way. 
template void svdQR(const raft::handle_t& handle, T* in, @@ -182,6 +181,219 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); } +/** + * @defgroup svd Singular Value Decomposition + * @{ + */ + +/** + * @brief singular value decomposition (SVD) on a column major + * matrix using QR decomposition + * @param[in] handle raft::handle_t + * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] sing_vals singular values raft::device_vector_view of shape (K) + * @param[out] left_sing_vecs optional left singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (m, n) + * @param[out] right_sing_vecs optional right singular values of raft::device_matrix_view with + * layout raft::col_major and dimensions (n, n) + */ +template +void svd_qr( + const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view sing_vals, + std::optional> left_sing_vecs = + std::nullopt, + std::optional> right_sing_vecs = + std::nullopt) +{ + if (left_sing_vecs) { + RAFT_EXPECTS(in.extent(0) == left_sing_vecs.value().extent(0) && + in.extent(1) == left_sing_vecs.value().extent(1), + "U should have dimensions m * n"); + } + if (right_sing_vecs) { + RAFT_EXPECTS(in.extent(1) == right_sing_vecs.value().extent(0) && + in.extent(1) == right_sing_vecs.value().extent(1), + "V should have dimensions n * n"); + } + svdQR(handle, + const_cast(in.data_handle()), + in.extent(0), + in.extent(1), + sing_vals.data_handle(), + left_sing_vecs.value().data_handle(), + right_sing_vecs.value().data_handle(), + false, + left_sing_vecs.has_value(), + right_sing_vecs.has_value(), + handle.get_stream()); +} + +/** + * @brief Overload of `svd_qr` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `svd_qr`. + */ +template +void svd_qr(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view sing_vals, + UType&& U, + VType&& V) +{ + std::optional> U_optional = + std::forward(U); + std::optional> V_optional = + std::forward(V); + + svd_qr(handle, in, sing_vals, U_optional, V_optional); +} + +/** + * @brief singular value decomposition (SVD) on a column major + * matrix using QR decomposition. 
Right singular vector matrix is transposed before returning + * @param[in] handle raft::handle_t + * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] sing_vals singular values raft::device_vector_view of shape (K) + * @param[out] left_sing_vecs optional left singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (m, n) + * @param[out] right_sing_vecs optional right singular values of raft::device_matrix_view with + * layout raft::col_major and dimensions (n, n) + */ +template +void svd_qr_transpose_right_vec( + const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view sing_vals, + std::optional> left_sing_vecs = + std::nullopt, + std::optional> right_sing_vecs = + std::nullopt) +{ + if (left_sing_vecs) { + RAFT_EXPECTS(in.extent(0) == left_sing_vecs.value().extent(0) && + in.extent(1) == left_sing_vecs.value().extent(1), + "U should have dimensions m * n"); + } + if (right_sing_vecs) { + RAFT_EXPECTS(in.extent(1) == right_sing_vecs.value().extent(0) && + in.extent(1) == right_sing_vecs.value().extent(1), + "V should have dimensions n * n"); + } + svdQR(handle, + const_cast(in.data_handle()), + in.extent(0), + in.extent(1), + sing_vals.data_handle(), + left_sing_vecs.value().data_handle(), + right_sing_vecs.value().data_handle(), + true, + left_sing_vecs.has_value(), + right_sing_vecs.has_value(), + handle.get_stream()); +} + +/** + * @brief Overload of `svd_qr_transpose_right_vec` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `svd_qr_transpose_right_vec`. + */ +template +void svd_qr_transpose_right_vec( + const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view sing_vals, + UType&& U, + VType&& V) +{ + std::optional> U_optional = + std::forward(U); + std::optional> V_optional = + std::forward(V); + + svd_qr_transpose_right_vec(handle, in, sing_vals, U_optional, V_optional); +} + +/** + * @brief singular value decomposition (SVD) on a column major + * matrix using Eigen decomposition. 
A square symmetric covariance matrix is constructed for the SVD + * @param[in] handle raft::handle_t + * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) + * @param[out] S singular values raft::device_vector_view of shape (K) + * @param[out] V right singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (m, n) + * @param[out] U optional left singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (m, n) + */ +template +void svd_eig( + const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_vector_view S, + raft::device_matrix_view V, + std::optional> U = std::nullopt) +{ + if (U) { + RAFT_EXPECTS(in.extent(0) == U.value().extent(0) && in.extent(1) == U.value().extent(1), + "U should have dimensions m * n"); + } + RAFT_EXPECTS(in.extent(0) == V.extent(0) && in.extent(1) == V.extent(1), + "V should have dimensions n * n"); + svdEig(handle, + const_cast(in.data_handle()), + in.extent(0), + in.extent(1), + S.data_handle(), + U.value().data_handle(), + V.value().data_handle(), + U.has_value(), + handle.get_stream()); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param[in] handle raft::handle_t + * @param[in] U left singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (m, k) + * @param[in] S singular values raft::device_vector_view of shape (k, k) + * @param[in] V right singular values of raft::device_matrix_view with layout + * raft::col_major and dimensions (k, n) + * @param[out] out output raft::device_matrix_view with layout raft::col_major of shape (m, n) + */ +template +void svd_reconstruction(const raft::handle_t& handle, + raft::device_matrix_view U, + raft::device_vector_view S, + raft::device_matrix_view V, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(S.extent(0) == S.extent(1), "S should be a square matrix"); + RAFT_EXPECTS(S.extent(0) == U.extent(1), + "Number of rows of S should be equal to number of columns in U"); + RAFT_EXPECTS(S.extent(1) == V.extent(0), + "Number of columns of S should be equal to number of rows in V"); + RAFT_EXPECTS(out.extent(0) == U.extent(0) && out.extent(1) == V.extent(1), + "Number of rows should be equal in out and U and number of columns should be equal " + "in out and V"); + + svdReconstruction(handle, + const_cast(U.data_handle()), + const_cast(S.data_handle()), + const_cast(V.data_handle()), + out.extent(0), + out.extent(1), + S.extent(0), + handle.get_stream()); +} + +/** @} */ // end of group svd + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index 01788a4188..26bce80388 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -18,176 +18,14 @@ * Please use the cuh version instead. 
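 *
 * For migration, a sketch of the replacement call is shown below; the
 * variable names are illustrative, and `in_view`, `s_view`, `u_view`,
 * `v_view` stand for mdspan views over the same buffers:
 * @code{.cpp}
 *   // before (deprecated raw-pointer API):
 *   raft::linalg::svdQR(handle, in, n_rows, n_cols, sing_vals, left_vecs,
 *                       right_vecs, false, true, true, stream);
 *   // after (mdspan API from svd.cuh):
 *   raft::linalg::svd_qr(handle, in_view, s_view,
 *                        std::make_optional(u_view), std::make_optional(v_view));
 * @endcode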
*/ -#ifndef __SVD_H -#define __SVD_H - -#pragma once - -#include "detail/svd.cuh" - -namespace raft { -namespace linalg { - /** - * @brief singular value decomposition (SVD) on the column major float type - * input matrix using QR method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular values of input matrix - * @param right_sing_vecs: right singular values of input matrix - * @param trans_right: transpose right vectors or not - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param stream cuda stream + * DISCLAIMER: this file is deprecated: use svd.cuh instead */ -// TODO: activate gen_left_vec and gen_right_vec options -// TODO: couldn't template this function due to cusolverDnSgesvd and -// cusolverSnSgesvd. Check if there is any other way. -template -void svdQR(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* sing_vals, - T* left_sing_vecs, - T* right_sing_vecs, - bool trans_right, - bool gen_left_vec, - bool gen_right_vec, - cudaStream_t stream) -{ - detail::svdQR(handle, - in, - n_rows, - n_cols, - sing_vals, - left_sing_vecs, - right_sing_vecs, - trans_right, - gen_left_vec, - gen_right_vec, - stream); -} - -template -void svdEig(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* S, - T* U, - T* V, - bool gen_left_vec, - cudaStream_t stream) -{ - detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream); -} -/** - * @brief on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular vectors of input matrix - * @param right_sing_vecs: right singular vectors of input matrix - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. 
- * @param stream cuda stream - */ -template -void svdJacobi(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - math_t* sing_vals, - math_t* left_sing_vecs, - math_t* right_sing_vecs, - bool gen_left_vec, - bool gen_right_vec, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - detail::svdJacobi(handle, - in, - n_rows, - n_cols, - sing_vals, - left_sing_vecs, - right_sing_vecs, - gen_left_vec, - gen_right_vec, - tol, - max_sweeps, - stream); -} - -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param U: left singular vectors of size n_rows x k - * @param S: square matrix with singular values on its diagonal, k x k - * @param V: right singular vectors of size n_cols x k - * @param out: reconstructed matrix to be returned - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values - * @param stream cuda stream - */ -template -void svdReconstruction(const raft::handle_t& handle, - math_t* U, - math_t* S, - math_t* V, - math_t* out, - int n_rows, - int n_cols, - int k, - cudaStream_t stream) -{ - detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream); -} - -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param A_d: input matrix - * @param U: left singular vectors of size n_rows x k - * @param S_vec: singular values as a vector - * @param V: right singular vectors of size n_cols x k - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values to be computed, 1.0 for normal SVD - * @param tol: tolerance for the evaluation - * @param stream cuda stream - */ -template -bool evaluateSVDByL2Norm(const raft::handle_t& handle, - math_t* A_d, - math_t* U, - math_t* S_vec, - math_t* V, - int n_rows, - int n_cols, - int k, - math_t tol, - cudaStream_t stream) -{ - return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "svd.cuh" diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 158cca168d..10e91a0313 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,7 +19,11 @@ #pragma once -#include +#include "detail/ternary_op.cuh" + +#include +#include +#include namespace raft { namespace linalg { @@ -37,8 +41,8 @@ namespace linalg { * @param op the device-lambda * @param stream cuda stream where to launch work */ -template -void ternaryOp(math_t* out, +template +void ternaryOp(out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, @@ -49,6 +53,64 @@ void ternaryOp(math_t* out, detail::ternaryOp(out, in1, in2, in3, len, op, stream); } +/** + * @defgroup ternary_op Element-Wise Ternary Operation + * @{ + */ + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam InType Input Type raft::device_mdspan + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in1 First input + * @param[in] in2 Second input + * @param[in] in3 Third input + * @param[out] out Output + * @param[in] op the device-lambda + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2, const InType& val3);` + */ +template , + typename = raft::enable_if_output_device_mdspan> +void ternary_op( + const raft::handle_t& handle, InType in1, InType in2, InType in3, OutType out, Lambda op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in3), "Input 3 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size() && in2.size() == in3.size(), + "Size mismatch between Output and Inputs"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } else { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } +} + +/** @} */ // end of group ternary_op + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/ternary_op.hpp b/cpp/include/raft/linalg/ternary_op.hpp index bce9eacb11..58dab89609 100644 --- a/cpp/include/raft/linalg/ternary_op.hpp +++ b/cpp/include/raft/linalg/ternary_op.hpp @@ -18,42 +18,14 @@ * Please use the cuh version instead. 
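 *
 * A sketch of the migration path (`in1`, `in2`, `in3`, and `out` below are
 * illustrative: raw device pointers on the deprecated side, the matching
 * raft::device_mdspan views on the new side; `op` is the same device lambda):
 * @code{.cpp}
 *   // before (deprecated raw-pointer API):
 *   raft::linalg::ternaryOp(out, in1, in2, in3, len, op, stream);
 *   // after (mdspan API from ternary_op.cuh):
 *   raft::linalg::ternary_op(handle, in1_view, in2_view, in3_view, out_view, op);
 * @endcode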
*/ -#ifndef __TERNARY_OP_H -#define __TERNARY_OP_H - -#pragma once - -#include - -namespace raft { -namespace linalg { /** - * @brief perform element-wise ternary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param in3 the third input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work + * DISCLAIMER: this file is deprecated: use ternary_op.cuh instead */ -template -void ternaryOp(math_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - detail::ternaryOp(out, in1, in2, in3, len, op, stream); -} -}; // end namespace linalg -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "ternary_op.cuh" diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh index cd78a2f495..e765ea7925 100644 --- a/cpp/include/raft/linalg/transpose.cuh +++ b/cpp/include/raft/linalg/transpose.cuh @@ -19,7 +19,7 @@ #pragma once #include "detail/transpose.cuh" -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index caa6bafedf..4c3f9224e4 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -18,49 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TRANSPOSE_H -#define __TRANSPOSE_H - -#pragma once - -#include "detail/transpose.cuh" - -namespace raft { -namespace linalg { - /** - * @brief transpose on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param out: output. Transposed input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream: cuda stream + * DISCLAIMER: this file is deprecated: use transpose.cuh instead */ -template -void transpose(const raft::handle_t& handle, - math_t* in, - math_t* out, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - detail::transpose(handle, in, out, n_rows, n_cols, stream); -} -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param inout: input and output matrix - * @param n: number of rows and columns of input matrix - * @param stream: cuda stream - */ -template -void transpose(math_t* inout, int n, cudaStream_t stream) -{ - detail::transpose(inout, n, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "transpose.cuh" diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index f2466df463..a90bda06d5 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -20,6 +20,10 @@ #include "detail/unary_op.cuh" +#include +#include +#include + namespace raft { namespace linalg { @@ -71,6 +75,75 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) detail::writeOnlyUnaryOpCaller(out, len, op, stream); } +/** + * @defgroup unary_op Element-Wise Unary Operations + * @{ + */ + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType Input Type raft::device_mdspan + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType Output Type raft::device_mdspan + * @param[in] handle raft::handle_t + * @param[in] in Input + * @param[out] out Output + * @param[in] op the device-lambda + * @note Lambda must be a functor with the following signature: + * `InType func(const InType& val);` + */ +template , + typename = raft::enable_if_output_device_mdspan> +void unary_op(const raft::handle_t& handle, InType in, OutType out, Lambda op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } else { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } +} + +/** + * @brief perform element-wise binary operation on the input arrays + * This function does not read from the input + * @tparam InType Input Type raft::device_mdspan + * @tparam Lambda the device-lambda performing the actual operation + * @param[in] handle raft::handle_t + * @param[inout] in Input/Output + * @param[in] op the device-lambda + * @note Lambda must be a functor with the following signature: + * `InType func(const InType& val);` + */ +template > +void write_only_unary_op(const raft::handle_t& handle, InType in, Lambda op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + + using in_value_t = typename InType::value_type; + + if (in.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + in.data_handle(), in.size(), op, handle.get_stream()); + } else { + writeOnlyUnaryOp( + in.data_handle(), in.size(), op, handle.get_stream()); + } +} + +/** @} */ // end of group unary_op + }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index ca1e3f9875..2ace126ff1 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -18,65 +18,14 @@ * Please use the cuh version instead. 
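 *
 * A sketch of the migration path (`in` and `out` below are illustrative:
 * raw device pointers on the deprecated side, the matching
 * raft::device_mdspan views on the new side; `op` is the same device lambda):
 * @code{.cpp}
 *   // before (deprecated raw-pointer API):
 *   raft::linalg::unaryOp(out, in, len, op, stream);
 *   // after (mdspan API from unary_op.cuh):
 *   raft::linalg::unary_op(handle, in_view, out_view, op);
 * @endcode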
*/ -#ifndef __UNARY_OP_H -#define __UNARY_OP_H - -#pragma once - -#include "detail/unary_op.cuh" - -namespace raft { -namespace linalg { - /** - * @brief perform element-wise unary operation in the input array - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` + * DISCLAIMER: this file is deprecated: use unary_op.cuh instead */ -template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::unaryOpCaller(out, in, len, op, stream); -} -/** - * @brief Perform an element-wise unary operation into the output array - * - * Compared to `unaryOp()`, this method does not do any reads from any inputs - * - * @tparam OutType output data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * - * @param[out] out the output array [on device] [len = len] - * @param[in] len number of elements in the input array - * @param[in] op the device-lambda which must be of the form: - * `void func(OutType* outLocationOffset, IdxType idx);` - * where outLocationOffset will be out + idx. - * @param[in] stream cuda stream where to launch work - */ -template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) -{ - detail::writeOnlyUnaryOpCaller(out, len, op, stream); -} +#pragma once -}; // end namespace linalg -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif +#include "unary_op.cuh" diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh index afdec24ebd..d26f5f73cf 100644 --- a/cpp/include/raft/matrix/col_wise_sort.cuh +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -18,10 +18,11 @@ #pragma once +#include +#include #include -namespace raft { -namespace matrix { +namespace raft::matrix { /** * @brief sort columns within each row of row-major input matrix and return sorted indexes @@ -50,7 +51,105 @@ void sort_cols_per_row(const InType* in, detail::sortColumnsPerRow( in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); } -}; // end namespace matrix -}; // end namespace raft + +/** + * @brief sort columns within each row of row-major input matrix and return sorted indexes + * modelled as key-value sort with key being input matrix and value being index of values + * @tparam in_t: element type of input matrix + * @tparam out_t: element type of output matrix + * @tparam matrix_idx_t: integer type for matrix indexing + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output value(index) matrix + * @param[out] sorted_keys: Optional, output matrix for sorted keys (input) + */ +template +void sort_cols_per_row(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + std::optional> + sorted_keys = std::nullopt) +{ + RAFT_EXPECTS(in.extent(1) == out.extent(1) && in.extent(0) == out.extent(0), + "Input and output matrices must have the same shape."); + + if (sorted_keys.has_value()) { + RAFT_EXPECTS(in.extent(1) == sorted_keys.value().extent(1) && + in.extent(0) == sorted_keys.value().extent(0), + "Input and `sorted_keys` matrices must have the same shape."); + } + + size_t workspace_size = 0; + bool alloc_workspace = false; + + in_t* keys = sorted_keys.has_value() ? sorted_keys.value().data_handle() : nullptr; + + detail::sortColumnsPerRow(in.data_handle(), + out.data_handle(), + in.extent(0), + in.extent(1), + alloc_workspace, + (void*)nullptr, + workspace_size, + handle.get_stream(), + keys); + + if (alloc_workspace) { + auto workspace = raft::make_device_vector(handle, workspace_size); + + detail::sortColumnsPerRow(in.data_handle(), + out.data_handle(), + in.extent(0), + in.extent(1), + alloc_workspace, + (void*)workspace.data_handle(), + workspace_size, + handle.get_stream(), + keys); + } +} + +namespace sort_cols_per_row_impl { +template +struct sorted_keys_alias { +}; + +template <> +struct sorted_keys_alias { + using type = double; +}; + +template +struct sorted_keys_alias< + std::optional>> { + using type = typename raft::device_matrix_view::value_type; +}; + +template +using sorted_keys_t = typename sorted_keys_alias::type; +} // namespace sort_cols_per_row_impl + +/** + * @brief Overload of `sort_keys_per_row` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `sort_keys_per_row`. 
+ */ +template +void sort_cols_per_row(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + sorted_keys_vector_type sorted_keys) +{ + using sorted_keys_type = sort_cols_per_row_impl::sorted_keys_t< + std::remove_const_t>>; + std::optional> sorted_keys_opt = + std::forward(sorted_keys); + + sort_cols_per_row(handle, in, out, sorted_keys_opt); +} + +}; // end namespace raft::matrix #endif \ No newline at end of file diff --git a/cpp/include/raft/matrix/col_wise_sort.hpp b/cpp/include/raft/matrix/col_wise_sort.hpp index 83a8738219..60c36db9e2 100644 --- a/cpp/include/raft/matrix/col_wise_sort.hpp +++ b/cpp/include/raft/matrix/col_wise_sort.hpp @@ -18,44 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __COL_WISE_SORT_H -#define __COL_WISE_SORT_H +/** + * DISCLAIMER: this file is deprecated: use col_wise_sort.cuh instead + */ #pragma once -#include - -namespace raft { -namespace matrix { - -/** - * @brief sort columns within each row of row-major input matrix and return sorted indexes - * modelled as key-value sort with key being input matrix and value being index of values - * @param in: input matrix - * @param out: output value(index) matrix - * @param n_rows: number rows of input matrix - * @param n_columns: number columns of input matrix - * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize - * @param workspacePtr: pointer to workspace memory - * @param workspaceSize: Size of workspace to be allocated - * @param stream: cuda stream to execute prim on - * @param sortedKeys: Optional, output matrix for sorted keys (input) - */ -template -void sort_cols_per_row(const InType* in, - OutType* out, - int n_rows, - int n_columns, - bool& bAllocWorkspace, - void* workspacePtr, - size_t& workspaceSize, - cudaStream_t stream, - InType* sortedKeys = nullptr) -{ - detail::sortColumnsPerRow( - in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); -} -}; // end namespace matrix -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "col_wise_sort.cuh" diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh new file mode 100644 index 0000000000..5f1d16485c --- /dev/null +++ b/cpp/include/raft/matrix/copy.cuh @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Copy selected rows of the input matrix into contiguous space. + * + * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], + * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. 
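 * That is, the rows listed in `indices` are gathered from `in` into consecutive
 * rows of `out`. A sketch (the handle, views, and extents are assumed for
 * illustration):
 * @code{.cpp}
 * auto out = raft::make_device_matrix<float, int>(handle, n_selected, n_cols);
 * raft::matrix::copy_rows(handle, in.view(), out.view(), indices.view());
 * @endcode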
+ * + * @param[in] handle raft handle + * @param[in] in input matrix + * @param[out] out output matrix + * @param[in] indices of the rows to be copied + */ +template +void copy_rows(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + raft::device_vector_view indices) +{ + RAFT_EXPECTS(in.extent(1) == out.extent(1), + "Input and output matrices must have same number of columns"); + RAFT_EXPECTS(indices.extent(0) == out.extent(0), + "Number of rows in output matrix must equal number of indices"); + detail::copyRows(in.data_handle(), + in.extent(0), + in.extent(1), + out.data_handle(), + indices.data_handle(), + indices.extent(0), + handle.get_stream(), + raft::is_row_major(in)); +} + +/** + * @brief copy matrix operation for column major matrices. + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output matrix + */ +template +void copy(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), + "Input and output matrix shapes must match."); + + raft::copy_async( + out.data_handle(), in.data_handle(), in.extent(0) * out.extent(1), handle.get_stream()); +} + +/** + * @brief copy matrix operation for column major matrices. First n_rows and + * n_cols of input matrix "in" is copied to "out" matrix. + * @param handle: raft handle for managing resources + * @param in: input matrix + * @param out: output matrix + */ +template +void trunc_zero_origin(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(out.extent(0) <= in.extent(0) && out.extent(1) <= in.extent(1), + "Output matrix must have less or equal number of rows and columns"); + + detail::truncZeroOrigin(in.data_handle(), + in.extent(0), + out.data_handle(), + out.extent(0), + out.extent(1), + handle.get_stream()); +} + +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index 65febcb6d8..97345aecb6 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #define INST_BLOCK_SORT(keyIn, keyOut, valueInOut, rows, columns, blockSize, elemPT, stream) \ devKeyValSortColumnPerRow<<>>( \ diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 81204bfe66..15f5204382 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -16,9 +16,9 @@ #pragma once -#include -#include -#include +#include +#include +#include #include @@ -83,7 +83,7 @@ struct Linewise { Vec v, w; bool update = true; for (; in < in_end; in += AlignWarp::Value, out += AlignWarp::Value, rowMod += warpPad) { - v.val.internal = __ldcv(in); + *v.vectorized_data() = __ldcv(in); while (rowMod >= rowLen) { rowMod -= rowLen; rowDiv++; @@ -105,7 +105,7 @@ struct Linewise { int l = 0; w.val.data[k] = op(v.val.data[k], (std::ignore = vecs, args[l++])...); } - *out = w.val.internal; + *out = *w.vectorized_data(); } } @@ -138,11 +138,11 @@ struct Linewise { Vec v; const IdxType d = BlockSize * gridDim.x; for (IdxType i = threadIdx.x + blockIdx.x * BlockSize; i < len; i += d) { - v.val.internal = __ldcv(in + i); + *v.vectorized_data() = __ldcv(in + i); #pragma unroll VecElems for (int k = 0; k < VecElems; k++) 
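// apply op to each lane of the vectorized value before the streaming store below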
v.val.data[k] = op(v.val.data[k], args.val.data[k]...); - __stwt(out + i, v.val.internal); + __stwt(out + i, *v.vectorized_data()); } } @@ -172,7 +172,7 @@ struct Linewise { __syncthreads(); { Vec out; - out.val.internal = reinterpret_cast(shm)[threadIdx.x]; + *out.vectorized_data() = reinterpret_cast(shm)[threadIdx.x]; return out; } } diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 9e996e19d9..07b9ccc12b 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -16,14 +16,14 @@ #pragma once -#include +#include #include -#include #include #include #include #include +#include #include #include @@ -141,7 +141,7 @@ void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t } template -void reciprocal(math_t* in, +void reciprocal(const math_t* in, math_t* out, math_t scalar, int len, @@ -363,8 +363,8 @@ void matrixVectorBinarySub(Type* data, } // Computes the argmax(d_in) column-wise in a DxN matrix -template -__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) +template +__global__ void argmaxKernel(const T* d_in, int D, int N, IdxT* argmax) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -384,19 +384,19 @@ __global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } } -template -void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +template +void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream) { int D = n_rows; int N = n_cols; if (D <= 32) { - argmaxKernel<<>>(in, D, N, out); + argmaxKernel<<>>(in, D, N, out); } else if (D <= 64) { - argmaxKernel<<>>(in, D, N, out); + argmaxKernel<<>>(in, D, N, out); } else if (D <= 128) { - argmaxKernel<<>>(in, D, N, out); + argmaxKernel<<>>(in, D, N, out); } else { - argmaxKernel<<>>(in, D, N, out); + argmaxKernel<<>>(in, D, N, out); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 3683132161..c425aad79b 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include @@ -28,9 +28,9 @@ #include #include #include -#include -#include +#include #include +#include namespace raft { namespace matrix { @@ -67,7 +67,7 @@ void copyRows(const m_t* in, template void truncZeroOrigin( - m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) + const m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) { auto m = out_n_rows; auto k = in_n_rows; @@ -279,7 +279,6 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; - // #TODO: Call from the public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/matrix/detail/print.hpp b/cpp/include/raft/matrix/detail/print.hpp new file mode 100644 index 0000000000..fc3d14861c --- /dev/null +++ b/cpp/include/raft/matrix/detail/print.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace raft::matrix::detail {
+
+template
+void printHost(
+  const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n')
+{
+  for (idx_t i = 0; i < n_rows; i++) {
+    for (idx_t j = 0; j < n_cols; j++) {
+      printf("%1.4f%c", in[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator);
+    }
+  }
+}
+
+}  // end namespace raft::matrix::detail
diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh
index 31164b2041..fa6e73de49 100644
--- a/cpp/include/raft/matrix/gather.cuh
+++ b/cpp/include/raft/matrix/gather.cuh
@@ -15,10 +15,12 @@
 */

 #pragma once
+
+#include
+#include
 #include

-namespace raft {
-namespace matrix {
+namespace raft::matrix {

 /**
  * @brief gather copies rows from a source matrix into a destination matrix according to a map.
@@ -49,6 +51,76 @@ void gather(const MatrixIteratorT in,
   detail::gather(in, D, N, map, map_length, out, stream);
 }

+/**
+ * @brief gather copies rows from a source matrix into a destination matrix according to a map.
+ *
+ * @tparam matrix_t Matrix element type
+ * @tparam map_t Map vector type
+ * @tparam idx_t integer type used for indexing
+ * @param[in] handle raft handle for managing resources
+ * @param[in] in Input matrix (assumed to be row-major)
+ * @param[in] map Vector of gather locations
+ * @param[out] out Output matrix (assumed to be row-major)
+ */
+template
+void gather(const raft::handle_t& handle,
+            raft::device_matrix_view in,
+            raft::device_vector_view map,
+            raft::device_matrix_view out)
+{
+  RAFT_EXPECTS(out.extent(0) == map.extent(0),
+               "Number of rows in output matrix must equal the size of the map vector");
+  RAFT_EXPECTS(out.extent(1) == in.extent(1),
+               "Number of columns in input and output matrices must be equal.");
+
+  raft::matrix::detail::gather(
+    const_cast(in.data_handle()),  // TODO: There's a better way to handle this
+    static_cast(in.extent(1)),
+    static_cast(in.extent(0)),
+    map.data_handle(),
+    static_cast(map.extent(0)),
+    out.data_handle(),
+    handle.get_stream());
+}
+
+/**
+ * @brief gather copies rows from a source matrix into a destination matrix according to a
+ * transformed map.
+ *
+ * @tparam matrix_t Matrix type
+ * @tparam map_t Map vector type
+ * @tparam map_xform_t Unary lambda expression or operator type, MapTransformOp's result
+ * type must be convertible to idx_t (= int) type. 
+ * @tparam idx_t integer type for indexing
+ * @param[in] handle raft handle for managing resources
+ * @param[in] in Input matrix (assumed to be row-major)
+ * @param[in] map Input vector of gather locations
+ * @param[out] out Output matrix (assumed to be row-major)
+ * @param[in] transform_op The transformation operation, transforms the map values to idx_t
+ */
+template
+void gather(const raft::handle_t& handle,
+            raft::device_matrix_view in,
+            raft::device_vector_view map,
+            raft::device_matrix_view out,
+            map_xform_t transform_op)
+{
+  RAFT_EXPECTS(out.extent(0) == map.extent(0),
+               "Number of rows in output matrix must equal the size of the map vector");
+  RAFT_EXPECTS(out.extent(1) == in.extent(1),
+               "Number of columns in input and output matrices must be equal.");
+
+  detail::gather(
+    const_cast(in.data_handle()),  // TODO: There's a better way to handle this
+    static_cast(in.extent(1)),
+    static_cast(in.extent(0)),
+    map.data_handle(),
+    static_cast(map.extent(0)),
+    out.data_handle(),
+    transform_op,
+    handle.get_stream());
+}
+
 /**
  * @brief gather copies rows from a source matrix into a destination matrix according to a
  * transformed map.
@@ -124,6 +196,53 @@ void gather_if(const MatrixIteratorT in,
   detail::gather_if(in, D, N, map, stencil, map_length, out, pred_op, stream);
 }

+/**
+ * @brief gather_if conditionally copies rows from a source matrix into a destination matrix
+ * according to a map.
+ *
+ * @tparam matrix_t Matrix value type
+ * @tparam map_t Map vector type
+ * @tparam stencil_t Stencil vector type
+ * @tparam unary_pred_t Unary lambda expression or operator type, unary_pred_t's result
+ * type must be convertible to bool type.
+ * @tparam idx_t integer type for indexing
+ * @param[in] handle raft handle for managing resources
+ * @param[in] in Input matrix (assumed to be row-major)
+ * @param[in] map Input vector of gather locations
+ * @param[in] stencil Input vector of stencil or predicate values
+ * @param[out] out Output matrix (assumed to be row-major)
+ * @param[in] pred_op Predicate to apply to the stencil values
+ */
+template
+void gather_if(const raft::handle_t& handle,
+               raft::device_matrix_view in,
+               raft::device_matrix_view out,
+               raft::device_vector_view map,
+               raft::device_vector_view stencil,
+               unary_pred_t pred_op)
+{
+  RAFT_EXPECTS(out.extent(0) == map.extent(0),
+               "Number of rows in output matrix must equal the size of the map vector");
+  RAFT_EXPECTS(out.extent(1) == in.extent(1),
+               "Number of columns in input and output matrices must be equal.");
+  RAFT_EXPECTS(map.extent(0) == stencil.extent(0),
+               "Number of elements in stencil must equal number of elements in map");
+
+  detail::gather_if(const_cast(in.data_handle()),
+                    out.extent(1),
+                    out.extent(0),
+                    map.data_handle(),
+                    stencil.data_handle(),
+                    map.extent(0),
+                    out.data_handle(),
+                    pred_op,
+                    handle.get_stream());
+}
+
 /**
  * @brief gather_if conditionally copies rows from a source matrix into a destination matrix
  * according to a transformed map.
@@ -169,5 +288,58 @@ void gather_if(const MatrixIteratorT in,
 {
   detail::gather_if(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream);
 }
-}  // namespace matrix
-}  // namespace raft
+
+/**
+ * @brief gather_if conditionally copies rows from a source matrix into a destination matrix
+ * according to a transformed map. 
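 * A sketch with assumed handle, views, and an offset transform for illustration:
 * @code{.cpp}
 * raft::matrix::gather_if(handle, in, out, map, stencil,
 *                         [] __device__(int s) { return s > 0; },   // predicate
 *                         [] __device__(int m) { return m + 1; });  // map transform
 * @endcode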
+ * + * @tparam matrix_t Matrix value type, for reading input matrix + * @tparam map_t Vector value type for map + * @tparam stencil_t Vector value type for stencil + * @tparam unary_pred_t Unary lambda expression or operator type, unary_pred_t's result + * type must be convertible to bool type. + * @tparam map_xform_t Unary lambda expression or operator type, map_xform_t's result + * type must be convertible to idx_t (= int) type. + * @tparam idx_t integer type for indexing + * @param[in] handle raft handle for managing resources + * @param[in] in Input matrix (assumed to be row-major) + * @param[in] map Vector of gather locations + * @param[in] stencil Vector of stencil or predicate values + * @param[out] out Output matrix (assumed to be row-major) + * @param[in] pred_op Predicate to apply to the stencil values + * @param[in] transform_op The transformation operation, transforms the map values to idx_t + */ +template +void gather_if(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + raft::device_vector_view map, + raft::device_vector_view stencil, + unary_pred_t pred_op, + map_xform_t transform_op) +{ + RAFT_EXPECTS(out.extent(0) == map.extent(0), + "Number of rows in output matrix must equal the size of the map vector"); + RAFT_EXPECTS(out.extent(1) == in.extent(1), + "Number of columns in input and output matrices must be equal."); + RAFT_EXPECTS(map.extent(0) == stencil.extent(0), + "Number of elements in stencil must equal number of elements in map"); + + detail::gather_if(const_cast(in.data_handle()), + in.extent(1), + in.extent(0), + map.data_handle(), + stencil.data_handle(), + map.extent(0), + out.data_handle(), + pred_op, + transform_op, + handle.get_stream()); +} + +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh new file mode 100644 index 0000000000..e3a6c09fe6 --- /dev/null +++ b/cpp/include/raft/matrix/init.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::matrix { +/** + * @brief set values to scalar in matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[in] in input matrix + * @param[out] out output matrix. 
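 * A sketch (the handle, the matrix `m`, and the make_host_scalar_view factory
 * call are assumed for illustration):
 * @code{.cpp}
 * float value = 0.0f;  // fill with zeros
 * raft::matrix::fill(handle, m.view(), m.view(), raft::make_host_scalar_view(&value));
 * @endcode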
The result is stored in the out matrix + * @param[in] scalar scalar value to fill matrix elements + */ +template +void fill(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + raft::host_scalar_view scalar) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be the same size."); + detail::setValue( + out.data_handle(), in.data_handle(), *(scalar.data_handle()), in.size(), handle.get_stream()); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh new file mode 100644 index 0000000000..6b383b14f5 --- /dev/null +++ b/cpp/include/raft/matrix/linewise_op.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * Run a function over matrix lines (rows or columns) with a variable number + * row-vectors or column-vectors. + * The term `line` here signifies that the lines can be either columns or rows, + * depending on the matrix layout. + * What matters is if the vectors are applied along lines (indices of vectors correspond to + * indices within lines), or across lines (indices of vectors correspond to line numbers). + * @tparam m_t matrix elements type + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @tparam Lambda type of lambda function used for the operation + * @tparam vec_t variadic types of device_vector_view vectors (size m if alongRows, size n + * otherwise) + * @param[in] handle raft handle for managing resources + * @param [out] out result of the operation; can be same as `in`; should be aligned the same + * as `in` to allow faster vectorized memory transfers. + * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. + * @param [in] alongLines whether vectors are indices along or across lines. + * @param [in] op the operation applied on each line: + * for i in [0..lineLen) and j in [0..nLines): + * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false + * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * @param [in] vecs zero or more vectors to be passed as arguments, + * size of each vector is `alongLines ? lineLen : nLines`. + */ +template > +void linewise_op(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + const bool alongLines, + Lambda op, + vec_t... vecs) +{ + constexpr auto is_rowmajor = std::is_same_v; + constexpr auto is_colmajor = std::is_same_v; + + static_assert(is_rowmajor || is_colmajor, + "layout for in and out must be either row or col major"); + + const idx_t lineLen = is_rowmajor ? in.extent(0) : in.extent(1); + const idx_t nLines = is_rowmajor ? 
in.extent(1) : in.extent(0); + + RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1), + "Input and output must have the same shape."); + + detail::MatrixLinewiseOp<16, 256>::run(out.data_handle(), + in.data_handle(), + lineLen, + nLines, + alongLines, + op, + handle.get_stream(), + vecs.data_handle()...); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 9e103afda5..3c2705cf87 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * This file is deprecated and will be removed in a future release. + * Please use versions in individual header files instead. + */ + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use versions in individual header files instead.") + #ifndef __MATH_H #define __MATH_H @@ -301,8 +310,8 @@ void ratio( * @param out: output vector of size n_cols * @param stream: cuda stream */ -template -void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +template +void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream) { detail::argmax(in, n_rows, n_cols, out, stream); } diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 1af7e37dec..3a7e0dad47 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * This file is deprecated and will be removed in a future release. + * Please use versions in individual header files instead. + */ + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use versions in individual header files instead.") + #ifndef __MATRIX_H #define __MATRIX_H @@ -21,6 +30,7 @@ #include "detail/linewise_op.cuh" #include "detail/matrix.cuh" +#include #include @@ -71,6 +81,24 @@ void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stre raft::copy_async(out, in, n_rows * n_cols, stream); } +/** + * @brief copy matrix operation for column major matrices. + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output matrix + */ +template +void copy(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), + "Input and output matrix shapes must match."); + + raft::copy_async( + out.data_handle(), in.data_handle(), in.extent(0) * out.extent(1), handle.get_stream()); +} + /** * @brief copy matrix operation for column major matrices. First n_rows and * n_cols of input matrix "in" is copied to "out" matrix. diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index 7409140d7c..428c914784 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -18,265 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MATRIX_H -#define __MATRIX_H - -#pragma once - -#include "detail/linewise_op.cuh" -#include "detail/matrix.cuh" - -#include - -namespace raft { -namespace matrix { - -using namespace std; - -/** - * @brief Copy selected rows of the input matrix into contiguous space. - * - * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], - * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. 
- * - * @param in input matrix - * @param n_rows number of rows of output matrix - * @param n_cols number of columns of output matrix - * @param out output matrix - * @param indices of the rows to be copied - * @param n_rows_indices number of rows to copy - * @param stream cuda stream - * @param rowMajor whether the matrix has row major layout - */ -template -void copyRows(const m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - const idx_array_t* indices, - idx_t n_rows_indices, - cudaStream_t stream, - bool rowMajor = false) -{ - detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor); -} - -/** - * @brief copy matrix operation for column major matrices. - * @param in: input matrix - * @param out: output matrix - * @param n_rows: number of rows of output matrix - * @param n_cols: number of columns of output matrix - * @param stream: cuda stream - */ -template -void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - raft::copy_async(out, in, n_rows * n_cols, stream); -} - -/** - * @brief copy matrix operation for column major matrices. First n_rows and - * n_cols of input matrix "in" is copied to "out" matrix. - * @param in: input matrix - * @param in_n_rows: number of rows of input matrix - * @param out: output matrix - * @param out_n_rows: number of rows of output matrix - * @param out_n_cols: number of columns of output matrix - * @param stream: cuda stream - */ -template -void truncZeroOrigin( - m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) -{ - detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream); -} - -/** - * @brief Columns of a column major matrix is reversed (i.e. first column and - * last column are swapped) - * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream - */ -template -void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::colReverse(inout, n_rows, n_cols, stream); -} - -/** - * @brief Rows of a column major matrix is reversed (i.e. 
first row and last - * row are swapped) - * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream - */ -template -void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::rowReverse(inout, n_rows, n_cols, stream); -} - -/** - * @brief Prints the data stored in GPU memory - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param h_separator: horizontal separator character - * @param v_separator: vertical separator character - * @param stream: cuda stream - */ -template -void print(const m_t* in, - idx_t n_rows, - idx_t n_cols, - char h_separator = ' ', - char v_separator = '\n', - cudaStream_t stream = rmm::cuda_stream_default) -{ - detail::print(in, n_rows, n_cols, h_separator, v_separator, stream); -} - -/** - * @brief Prints the data stored in CPU memory - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - */ -template -void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) -{ - detail::printHost(in, n_rows, n_cols); -} - -/** - * @brief Slice a matrix (in-place) - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param out: output matrix - * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param x2, y2: coordinate of the bottom-right point of the wanted area - * (1-based) - * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, - * 3, 0, 1, 4, 3); - * @param stream: cuda stream - */ -template -void sliceMatrix(m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - idx_t x1, - idx_t y1, - idx_t x2, - idx_t y2, - cudaStream_t stream) -{ - detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); -} - /** - * @brief Copy the upper triangular part of a matrix to another - * @param src: input matrix with a size of n_rows x n_cols - * @param dst: output matrix with a size of kxk, k = min(n_rows, n_cols) - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream + * DISCLAIMER: this file is deprecated: use matrix.cuh instead */ -template -void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); -} -/** - * @brief Initialize a diagonal matrix with a vector - * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param n_rows: number of rows of the matrix - * @param n_cols: number of columns of the matrix - * @param stream: cuda stream - */ -template -void initializeDiagonalMatrix( - m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); -} - -/** - * @brief Get a square matrix with elements on diagonal reversed (in-place) - * @param in: square input matrix with size len x len - * @param len: size of one side of the matrix - * @param stream: cuda stream - */ -template -void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) -{ - detail::getDiagonalInverseMatrix(in, len, stream); -} - -/** - * @brief Get the L2/F-norm of a matrix/vector - * @param handle - * @param in: input matrix/vector with 
totally size elements - * @param size: size of the matrix/vector - * @param stream: cuda stream - */ -template -m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) -{ - return detail::getL2Norm(handle, in, size, stream); -} - -/** - * Run a function over matrix lines (rows or columns) with a variable number - * row-vectors or column-vectors. - * The term `line` here signifies that the lines can be either columns or rows, - * depending on the matrix layout. - * What matters is if the vectors are applied along lines (indices of vectors correspond to - * indices within lines), or across lines (indices of vectors correspond to line numbers). - * - * @param [out] out result of the operation; can be same as `in`; should be aligned the same - * as `in` to allow faster vectorized memory transfers. - * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. - * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in - * col-major) - * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) - * @param [in] alongLines whether vectors are indices along or across lines. - * @param [in] op the operation applied on each line: - * for i in [0..lineLen) and j in [0..nLines): - * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true - * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false - * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). - * @param [in] stream a cuda stream for the kernels - * @param [in] vecs zero or more vectors to be passed as arguments, - * size of each vector is `alongLines ? lineLen : nLines`. - */ -template -void linewiseOp(m_t* out, - const m_t* in, - const idx_t lineLen, - const idx_t nLines, - const bool alongLines, - Lambda op, - cudaStream_t stream, - Vecs... vecs) -{ - common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", - alongLines ? 'l' : 'x', - sizeof...(Vecs), - size_t(lineLen), - size_t(nLines)); - detail::MatrixLinewiseOp<16, 256>::run( - out, in, lineLen, nLines, alongLines, op, stream, vecs...); -} +#pragma once -}; // end namespace matrix -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "matrix.cuh" diff --git a/cpp/include/raft/matrix/matrix_types.hpp b/cpp/include/raft/matrix/matrix_types.hpp new file mode 100644 index 0000000000..1f22154627 --- /dev/null +++ b/cpp/include/raft/matrix/matrix_types.hpp @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace raft::matrix { + +struct print_separators { + char horizontal = ' '; + char vertical = '\n'; +}; + +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/power.cuh b/cpp/include/raft/matrix/power.cuh new file mode 100644 index 0000000000..4e2b3b7d72 --- /dev/null +++ b/cpp/include/raft/matrix/power.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft::matrix { + +/** + * @brief Power of every element in the input matrix + * @tparam math_t type of matrix elements + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output matrix. The result is stored in the out matrix + * @param[in] scalar: every element is multiplied with scalar. + */ +template +void weighted_power(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + math_t scalar) +{ + RAFT_EXPECTS(in.size() == out.size(), "Size of input and output matrices must be equal"); + detail::power(in.data_handle(), out.data_handle(), scalar, in.size(), handle.get_stream()); +} + +/** + * @brief Power of every element in the input matrix (inplace) + * @tparam math_t matrix element type + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[inout] inout: input matrix and also the result is stored + * @param[in] scalar: every element is multiplied with scalar. + */ +template +void weighted_power(const raft::handle_t& handle, + raft::device_matrix_view inout, + math_t scalar) +{ + detail::power(inout.data_handle(), scalar, inout.size(), handle.get_stream()); +} + +/** + * @brief Power of every element in the input matrix (inplace) + * @tparam math_t matrix element type + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[inout] inout: input matrix and also the result is stored + */ +template +void power(const raft::handle_t& handle, raft::device_matrix_view inout) +{ + detail::power(inout.data_handle(), inout.size(), handle.get_stream()); +} + +/** + * @brief Power of every element in the input matrix + * @tparam math_t type used for matrix elements + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix (row or column major) + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output matrix. 
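 * A sketch with assumed handle and views:
 * @code{.cpp}
 * raft::matrix::power(handle, in.view(), out.view());  // illustrative call only
 * @endcode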
The result is stored in the out matrix + * @{ + */ +template +void power(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be same size."); + detail::power(in.data_handle(), out.data_handle(), in.size(), handle.get_stream()); +} + +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/print.cuh b/cpp/include/raft/matrix/print.cuh new file mode 100644 index 0000000000..4d3a8ca938 --- /dev/null +++ b/cpp/include/raft/matrix/print.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Prints the data stored in GPU memory + * @tparam m_t type of matrix elements + * @tparam idx_t integer type used for indexing + * @param[in] handle: raft handle + * @param[in] in: input matrix + * @param[in] separators: horizontal and vertical separator characters + */ +template +void print(const raft::handle_t& handle, + raft::device_matrix_view in, + print_separators& separators) +{ + detail::print(in.data_handle(), + in.extent(0), + in.extent(1), + separators.horizontal, + separators.vertical, + handle.get_stream()); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/print.hpp b/cpp/include/raft/matrix/print.hpp new file mode 100644 index 0000000000..86c314ed44 --- /dev/null +++ b/cpp/include/raft/matrix/print.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Prints the data stored in CPU memory + * @param[in] in: input matrix with column-major layout + * @param[in] separators: horizontal and vertical separator characters + */ +template +void print(raft::host_matrix_view in, print_separators& separators) +{ + detail::printHost( + in.data_handle(), in.extent(0), in.extent(1), separators.horizontal, separators.vertical); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/ratio.cuh b/cpp/include/raft/matrix/ratio.cuh new file mode 100644 index 0000000000..7895ea972f --- /dev/null +++ b/cpp/include/raft/matrix/ratio.cuh @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft::matrix { + +/** + * @brief ratio of every element over sum of input vector is calculated + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle + * @param[in] src: input matrix + * @param[out] dest: output matrix. The result is stored in the dest matrix + */ +template +void ratio(const raft::handle_t& handle, + raft::device_matrix_view src, + raft::device_matrix_view dest) +{ + RAFT_EXPECTS(src.size() == dest.size(), "Input and output matrices must be the same size."); + detail::ratio(handle, src.data_handle(), dest.data_handle(), src.size(), handle.get_stream()); +} + +/** + * @brief ratio of every element over sum of input vector is calculated + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle + * @param[inout] inout: input matrix + */ +template +void ratio(const raft::handle_t& handle, raft::device_matrix_view inout) +{ + detail::ratio( + handle, inout.data_handle(), inout.data_handle(), inout.size(), handle.get_stream()); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/reciprocal.cuh b/cpp/include/raft/matrix/reciprocal.cuh new file mode 100644 index 0000000000..c41ecfb999 --- /dev/null +++ b/cpp/include/raft/matrix/reciprocal.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Reciprocal of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @param handle: raft handle + * @param in: input matrix and also the result is stored + * @param out: output matrix. 
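 * A sketch (the handle, views, and scalar below are assumed for illustration):
 * @code{.cpp}
 * float scale = 1.0f;
 * raft::matrix::reciprocal(handle, in.view(), out.view(),
 *                          raft::make_host_scalar_view(&scale),
 *                          true,    // setzero
 *                          1e-8f);  // thres
 * @endcode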
The result is stored in the out matrix + * @param scalar: every element is multiplied with scalar + * @param setzero round down to zero if the input is less the threshold + * @param thres the threshold used to forcibly set inputs to zero + * @{ + */ +template +void reciprocal(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + raft::host_scalar_view scalar, + bool setzero = false, + math_t thres = 1e-15) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have the same size."); + detail::reciprocal(in.data_handle(), + out.data_handle(), + *(scalar.data_handle()), + in.size(), + handle.get_stream(), + setzero, + thres); +} + +/** + * @brief Reciprocal of every element in the input matrix (in place) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle to manage resources + * @param[inout] inout: input matrix with in-place results + * @param[in] scalar: every element is multiplied with scalar + * @param[in] setzero round down to zero if the input is less the threshold + * @param[in] thres the threshold used to forcibly set inputs to zero + * @{ + */ +template +void reciprocal(const raft::handle_t& handle, + raft::device_matrix_view inout, + raft::host_scalar_view scalar, + bool setzero = false, + math_t thres = 1e-15) +{ + detail::reciprocal(inout.data_handle(), + *(scalar.data_handle()), + inout.size(), + handle.get_stream(), + setzero, + thres); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/sign_flip.cuh b/cpp/include/raft/matrix/sign_flip.cuh new file mode 100644 index 0000000000..01f8829c85 --- /dev/null +++ b/cpp/include/raft/matrix/sign_flip.cuh @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief sign flip stabilizes the sign of col major eigen vectors. + * The sign is flipped if the column has negative |max|. + * @tparam math_t floating point type used for matrix elements + * @tparam idx_t integer type used for indexing + * @param[in] handle: raft handle + * @param[inout] inout: input matrix. Result also stored in this parameter + */ +template +void sign_flip(const raft::handle_t& handle, + raft::device_matrix_view inout) +{ + detail::signFlip(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/sqrt.cuh b/cpp/include/raft/matrix/sqrt.cuh new file mode 100644 index 0000000000..302167480e --- /dev/null +++ b/cpp/include/raft/matrix/sqrt.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[in] in: input matrix and also the result is stored + * @param[out] out: output matrix. The result is stored in the out matrix + */ +template +void sqrt(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size."); + detail::seqRoot(in.data_handle(), out.data_handle(), in.size(), handle.get_stream()); +} + +/** + * @brief Square root of every element in the input matrix (in place) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[inout] inout: input matrix with in-place results + */ +template +void sqrt(const raft::handle_t& handle, raft::device_matrix_view inout) +{ + detail::seqRoot(inout.data_handle(), inout.size(), handle.get_stream()); +} + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[in] in: input matrix and also the result is stored + * @param[out] out: output matrix. 
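 * A sketch (the handle, views, and weight scalar are assumed for illustration):
 * @code{.cpp}
 * float weight = 2.0f;
 * raft::matrix::weighted_sqrt(handle, in.view(), out.view(),
 *                             raft::make_host_scalar_view(&weight));
 * @endcode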
The result is stored in the out matrix + * @param[in] scalar: every element is multiplied with scalar + * @param[in] set_neg_zero whether to set negative numbers to zero + */ +template +void weighted_sqrt(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + raft::host_scalar_view scalar, + bool set_neg_zero = false) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size."); + detail::seqRoot(in.data_handle(), + out.data_handle(), + *(scalar.data_handle()), + in.size(), + handle.get_stream(), + set_neg_zero); +} + +/** + * @brief Square root of every element in the input matrix (in place) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[inout] inout: input matrix and also the result is stored + * @param[in] scalar: every element is multiplied with scalar + * @param[in] set_neg_zero whether to set negative numbers to zero + */ +template +void weighted_sqrt(const raft::handle_t& handle, + raft::device_matrix_view inout, + raft::host_scalar_view scalar, + bool set_neg_zero = false) +{ + detail::seqRoot( + inout.data_handle(), *(scalar.data_handle()), inout.size(), handle.get_stream(), set_neg_zero); +} + +} // namespace raft::matrix diff --git a/cpp/include/raft/matrix/threshold.cuh b/cpp/include/raft/matrix/threshold.cuh new file mode 100644 index 0000000000..7540ceb3c6 --- /dev/null +++ b/cpp/include/raft/matrix/threshold.cuh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft::matrix { + +/** + * @brief sets the small values to zero based on a defined threshold + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param handle: raft handle + * @param[in] in: input matrix + * @param[out] out: output matrix. 
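 * A sketch with assumed handle and views, and an illustrative threshold:
 * @code{.cpp}
 * raft::matrix::zero_small_values(handle, in.view(), out.view(), 1e-6f);
 * @endcode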
The result is stored in the out matrix + * @param[in] thres threshold to set values to zero + */ +template +void zero_small_values(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + math_t thres = 1e-15) +{ + RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size"); + detail::setSmallValuesZero( + out.data_handle(), in.data_handle(), in.size(), handle.get_stream(), thres); +} + +/** + * @brief sets the small values to zero in-place based on a defined threshold + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t integer type used for indexing + * @tparam layout layout of the matrix data (must be row or col major) + * @param handle: raft handle + * @param inout: input matrix and also the result is stored + * @param thres: threshold + */ +template +void zero_small_values(const raft::handle_t& handle, + raft::device_matrix_view inout, + math_t thres = 1e-15) +{ + detail::setSmallValuesZero(inout.data_handle(), inout.size(), handle.get_stream(), thres); +} +} // namespace raft::matrix diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index 93f81db1ac..f1ecabf0eb 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,152 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include "cuda_utils.cuh" - -namespace raft { - /** - * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time. - * - * @tparam Value_ a compile-time value representable as a power-of-two. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -template -struct Pow2 { - typedef decltype(Value_) Type; - static constexpr Type Value = Value_; - static constexpr Type Log2 = log2(Value); - static constexpr Type Mask = Value - 1; - - static_assert(std::is_integral::value, "Value must be integral."); - static_assert(Value && !(Value & Mask), "Value must be power of two."); - -#define Pow2_FUNC_QUALIFIER static constexpr __host__ __device__ __forceinline__ -#define Pow2_WHEN_INTEGRAL(I) std::enable_if_t -#define Pow2_IS_REPRESENTABLE_AS(I) (std::is_integral::value && Type(I(Value)) == Value) - - /** - * Integer division by Value truncated toward zero - * (same as `x / Value` in C++). - * - * Invariant: `x = Value * quot(x) + rem(x)` - */ - template - Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) quot(I x) noexcept - { - if constexpr (std::is_signed::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); - if constexpr (std::is_unsigned::value) return x >> I(Log2); - } - /** - * Remainder of integer division by Value truncated toward zero - * (same as `x % Value` in C++). - * - * Invariant: `x = Value * quot(x) + rem(x)`. - */ - template - Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) rem(I x) noexcept - { - if constexpr (std::is_signed::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); - if constexpr (std::is_unsigned::value) return x & I(Mask); - } - - /** - * Integer division by Value truncated toward negative infinity - * (same as `x // Value` in Python). - * - * Invariant: `x = Value * div(x) + mod(x)`. 
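For orientation, here is a minimal usage sketch of the new `raft::matrix` entry points above. The include paths and the `raft::make_device_matrix` / `raft::make_host_scalar_view` factories are assumptions based on RAFT's mdspan utilities, not part of this patch:

```cpp
#include <raft/core/handle.hpp>      // assumed include paths; adjust to your RAFT version
#include <raft/matrix/sqrt.cuh>
#include <raft/matrix/threshold.cuh>

void matrix_math_example()
{
  raft::handle_t handle;

  // 4 x 4 row-major matrix of floats on the device (factory name assumed).
  auto m = raft::make_device_matrix<float, int>(handle, 4, 4);
  // ... fill `m` with non-negative values on the device ...

  // In-place elementwise square root.
  raft::matrix::sqrt(handle, m.view());

  // In-place weighted square root; the scalar weight lives on the host.
  float weight     = 2.0f;
  auto weight_view = raft::make_host_scalar_view<float>(&weight);
  raft::matrix::weighted_sqrt(handle, m.view(), weight_view, /*set_neg_zero=*/true);

  // Zero out entries smaller than the given threshold, in place.
  raft::matrix::zero_small_values(handle, m.view(), /*thres=*/1e-6f);
}
```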
- * - * Note, `div` and `mod` for negative values are slightly faster - * than `quot` and `rem`, but behave slightly different - * compared to normal C++ operators `/` and `%`. - */ - template - Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) div(I x) noexcept - { - return x >> I(Log2); - } - - /** - * x modulo Value operation (remainder of the `div(x)`) - * (same as `x % Value` in Python). - * - * Invariant: `mod(x) >= 0` - * Invariant: `x = Value * div(x) + mod(x)`. - * - * Note, `div` and `mod` for negative values are slightly faster - * than `quot` and `rem`, but behave slightly different - * compared to normal C++ operators `/` and `%`. - */ - template - Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) mod(I x) noexcept - { - return x & I(Mask); - } - -#define Pow2_CHECK_TYPE(T) \ - static_assert(std::is_pointer::value || std::is_integral::value, \ - "Only pointer or integral types make sense here") - - /** - * Tell whether the pointer or integral is Value-aligned. - * NB: for pointers, the alignment is checked in bytes, not in elements. - */ - template - Pow2_FUNC_QUALIFIER bool isAligned(PtrT p) noexcept - { - Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(p) == 0; - if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(reinterpret_cast(p)) == 0; - } - - /** Tell whether two pointers have the same address modulo Value. */ - template - Pow2_FUNC_QUALIFIER bool areSameAlignOffsets(PtrT a, PtrS b) noexcept - { - Pow2_CHECK_TYPE(PtrT); - Pow2_CHECK_TYPE(PtrS); - Type x, y; - if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) - x = Type(mod(a)); - else - x = mod(reinterpret_cast(a)); - if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrS)) - y = Type(mod(b)); - else - y = mod(reinterpret_cast(b)); - return x == y; - } +/** + * DISCLAIMER: this file is deprecated: use lap.cuh instead + */ - /** Get this or next Value-aligned address (in bytes) or integral. */ - template - Pow2_FUNC_QUALIFIER PtrT roundUp(PtrT p) noexcept - { - Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return (p + PtrT(Mask)) & PtrT(~Mask); - if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { - auto x = reinterpret_cast(p); - return reinterpret_cast((x + Mask) & (~Mask)); - } - } +#pragma once - /** Get this or previous Value-aligned address (in bytes) or integral. */ - template - Pow2_FUNC_QUALIFIER PtrT roundDown(PtrT p) noexcept - { - Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return p & PtrT(~Mask); - if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { - auto x = reinterpret_cast(p); - return reinterpret_cast(x & (~Mask)); - } - } -#undef Pow2_CHECK_TYPE -#undef Pow2_IS_REPRESENTABLE_AS -#undef Pow2_FUNC_QUALIFIER -#undef Pow2_WHEN_INTEGRAL -}; +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the raft/util version instead.") -}; // namespace raft +#include diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh index f214abce58..212245a9bf 100644 --- a/cpp/include/raft/random/detail/make_blobs.cuh +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -17,11 +17,11 @@ #pragma once #include "permute.cuh" -#include -#include #include #include #include +#include +#include #include #include diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh index 5556abb8e8..f06e20d4a6 100644 --- a/cpp/include/raft/random/detail/make_regression.cuh +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -22,8 +22,7 @@ #include -#include -#include +#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include namespace raft::random { diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index 15789742fd..2d19773c3b 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -16,15 +16,21 @@ #pragma once #include "curand_wrappers.hpp" +#include "random_types.hpp" #include -#include -#include -#include +#include +#include +#include +#include #include #include #include #include +#include +#include +#include #include +#include // mvg.cuh takes in matrices that are colomn major (as in fortan) #define IDX2C(i, j, ld) (j * ld + i) @@ -286,5 +292,157 @@ class multi_variable_gaussian_impl { ~multi_variable_gaussian_impl() { deinit(); } }; // end of multi_variable_gaussian_impl +template +class multi_variable_gaussian_setup_token; + +template +multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( + const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + const int dim, + const multi_variable_gaussian_decomposition_method method); + +template +void compute_multi_variable_gaussian_impl( + multi_variable_gaussian_setup_token& token, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X); + +template +class multi_variable_gaussian_setup_token { + template + friend multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( + const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + const int dim, + const multi_variable_gaussian_decomposition_method method); + + template + friend void compute_multi_variable_gaussian_impl( + multi_variable_gaussian_setup_token& token, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X); + + private: + typename multi_variable_gaussian_impl::Decomposer new_enum_to_old_enum( + multi_variable_gaussian_decomposition_method method) + { + if (method == multi_variable_gaussian_decomposition_method::CHOLESKY) { + return multi_variable_gaussian_impl::chol_decomp; + } else if (method == multi_variable_gaussian_decomposition_method::JACOBI) { + return multi_variable_gaussian_impl::jacobi; + } else { + return multi_variable_gaussian_impl::qr; + } + } + + // Constructor, only for use by friend functions. + // Hiding this will let us change the implementation in the future. 
+ multi_variable_gaussian_setup_token(const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + const int dim, + const multi_variable_gaussian_decomposition_method method) + : impl_(std::make_unique>( + handle, dim, new_enum_to_old_enum(method))), + handle_(handle), + mem_resource_(mem_resource), + dim_(dim) + { + } + + /** + * @brief Compute the multivariable Gaussian. + * + * @param[in] x vector of dim elements + * @param[inout] P On input, dim x dim matrix; overwritten on output + * @param[out] X dim x nPoints matrix + */ + void compute(std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X) + { + const int input_dim = P.extent(0); + RAFT_EXPECTS(input_dim == dim(), + "multi_variable_gaussian: " + "P.extent(0) = %d does not match the extent %d " + "with which the token was created", + input_dim, + dim()); + RAFT_EXPECTS(P.extent(0) == P.extent(1), + "multi_variable_gaussian: " + "P must be square, but P.extent(0) = %d != P.extent(1) = %d", + P.extent(0), + P.extent(1)); + RAFT_EXPECTS(P.extent(0) == X.extent(0), + "multi_variable_gaussian: " + "P.extent(0) = %d != X.extent(0) = %d", + P.extent(0), + X.extent(0)); + const bool x_has_value = x.has_value(); + const int x_extent_0 = x_has_value ? (*x).extent(0) : 0; + RAFT_EXPECTS(not x_has_value || P.extent(0) == x_extent_0, + "multi_variable_gaussian: " + "P.extent(0) = %d != x.extent(0) = %d", + P.extent(0), + x_extent_0); + const int nPoints = X.extent(1); + const ValueType* x_ptr = x_has_value ? (*x).data_handle() : nullptr; + + auto workspace = allocate_workspace(); + impl_->set_workspace(workspace.data()); + impl_->give_gaussian(nPoints, P.data_handle(), X.data_handle(), x_ptr); + } + + private: + std::unique_ptr> impl_; + const raft::handle_t& handle_; + rmm::mr::device_memory_resource& mem_resource_; + int dim_ = 0; + + auto allocate_workspace() const + { + const auto num_elements = impl_->get_workspace_size(); + return rmm::device_uvector{num_elements, handle_.get_stream(), &mem_resource_}; + } + + int dim() const { return dim_; } +}; + +template +multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( + const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + const int dim, + const multi_variable_gaussian_decomposition_method method) +{ + return multi_variable_gaussian_setup_token(handle, mem_resource, dim, method); +} + +template +void compute_multi_variable_gaussian_impl( + multi_variable_gaussian_setup_token& token, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X) +{ + token.compute(x, P, X); +} + +template +void compute_multi_variable_gaussian_impl( + const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X, + const multi_variable_gaussian_decomposition_method method) +{ + auto token = + build_multi_variable_gaussian_token_impl(handle, mem_resource, P.extent(0), method); + compute_multi_variable_gaussian_impl(token, x, P, X); +} + }; // end of namespace detail -}; // end of namespace raft::random \ No newline at end of file +}; // end of namespace raft::random diff --git a/cpp/include/raft/random/detail/permute.cuh b/cpp/include/raft/random/detail/permute.cuh index 28eaf9136c..9582f69e34 100644 --- a/cpp/include/raft/random/detail/permute.cuh +++ b/cpp/include/raft/random/detail/permute.cuh @@ -18,9 +18,9 @@ #include #include -#include -#include -#include +#include +#include +#include namespace raft::random 
{ namespace detail { diff --git a/cpp/include/raft/random/detail/random_types.hpp b/cpp/include/raft/random/detail/random_types.hpp new file mode 100644 index 0000000000..28108f9513 --- /dev/null +++ b/cpp/include/raft/random/detail/random_types.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::random::detail { + +enum class multi_variable_gaussian_decomposition_method { CHOLESKY, JACOBI, QR }; + +}; // end of namespace raft::random::detail diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh index 8a1f23e785..5ce7e909ee 100644 --- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh +++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh @@ -16,10 +16,13 @@ #pragma once -#include -#include +#include "rmat_rectangular_generator_types.cuh" + +#include #include #include +#include +#include namespace raft { namespace random { @@ -182,6 +185,111 @@ void rmat_rectangular_gen_caller(IdxT* out, r.advance(n_edges, max_scale); } +/** + * @brief Implementation of `raft::random::rmat_rectangular_gen_impl`. + * + * @tparam IdxT type of each node index + * @tparam ProbT data type used for probability distributions (either fp32 or fp64) + * @param[in] handle RAFT handle, containing the CUDA stream on which to schedule work + * @param[in] r underlying state of the random generator. Especially useful when + * one wants to call this API for multiple times in order to generate + * a larger graph. For that case, just create this object with the + * initial seed once and after every call continue to pass the same + * object for the successive calls. + * @param[out] output Encapsulation of one, two, or three output vectors. + * @param[in] theta distribution of each quadrant at each level of resolution. + * Since these are probabilities, each of the 2x2 matrices for + * each level of the RMAT must sum to one. [on device] + * [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed + * that each of the group of 2 x 2 numbers all sum up to 1. + * @param[in] r_scale 2^r_scale represents the number of source nodes + * @param[in] c_scale 2^c_scale represents the number of destination nodes + */ +template +void rmat_rectangular_gen_impl(const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_vector_view theta, + raft::random::detail::rmat_rectangular_gen_output output, + IdxT r_scale, + IdxT c_scale) +{ + static_assert(std::is_integral_v, + "rmat_rectangular_gen: " + "Template parameter IdxT must be an integral type"); + if (output.empty()) { + return; // nothing to do; not an error + } + + const IdxT expected_theta_len = IdxT(4) * (r_scale >= c_scale ? 
r_scale : c_scale); + RAFT_EXPECTS(theta.extent(0) == expected_theta_len, + "rmat_rectangular_gen: " + "theta.extent(0) = %zu != 2 * 2 * max(r_scale = %zu, c_scale = %zu) = %zu", + static_cast(theta.extent(0)), + static_cast(r_scale), + static_cast(c_scale), + static_cast(expected_theta_len)); + + auto out = output.out_view(); + auto out_src = output.out_src_view(); + auto out_dst = output.out_dst_view(); + const bool out_has_value = out.has_value(); + const bool out_src_has_value = out_src.has_value(); + const bool out_dst_has_value = out_dst.has_value(); + IdxT* out_ptr = out_has_value ? (*out).data_handle() : nullptr; + IdxT* out_src_ptr = out_src_has_value ? (*out_src).data_handle() : nullptr; + IdxT* out_dst_ptr = out_dst_has_value ? (*out_dst).data_handle() : nullptr; + const IdxT n_edges = output.number_of_edges(); + + rmat_rectangular_gen_caller(out_ptr, + out_src_ptr, + out_dst_ptr, + theta.data_handle(), + r_scale, + c_scale, + n_edges, + handle.get_stream(), + r); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that assumes the same + * a, b, c, d probability distributions across all the scales. + * + * `a`, `b, and `c` effectively replace the above overload's + * `theta` parameter. + */ +template +void rmat_rectangular_gen_impl(const raft::handle_t& handle, + raft::random::RngState& r, + raft::random::detail::rmat_rectangular_gen_output output, + ProbT a, + ProbT b, + ProbT c, + IdxT r_scale, + IdxT c_scale) +{ + static_assert(std::is_integral_v, + "rmat_rectangular_gen: " + "Template parameter IdxT must be an integral type"); + if (output.empty()) { + return; // nothing to do; not an error + } + + auto out = output.out_view(); + auto out_src = output.out_src_view(); + auto out_dst = output.out_dst_view(); + const bool out_has_value = out.has_value(); + const bool out_src_has_value = out_src.has_value(); + const bool out_dst_has_value = out_dst.has_value(); + IdxT* out_ptr = out_has_value ? (*out).data_handle() : nullptr; + IdxT* out_src_ptr = out_src_has_value ? (*out_src).data_handle() : nullptr; + IdxT* out_dst_ptr = out_dst_has_value ? (*out_dst).data_handle() : nullptr; + const IdxT n_edges = output.number_of_edges(); + + detail::rmat_rectangular_gen_caller( + out_ptr, out_src_ptr, out_dst_ptr, a, b, c, r_scale, c_scale, n_edges, handle.get_stream(), r); +} + } // end namespace detail } // end namespace random } // end namespace raft diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator_types.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator_types.cuh new file mode 100644 index 0000000000..daf3392f3d --- /dev/null +++ b/cpp/include/raft/random/detail/rmat_rectangular_generator_types.cuh @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace random { +namespace detail { + +/** + * @brief Implementation detail for checking output vector parameter(s) + * of `raft::random::rmat_rectangular_gen`. + * + * `raft::random::rmat_rectangular_gen` lets users specify + * output vector(s) in three different ways. + * + * 1. One vector: `out`, an "array-of-structs" representation + * of the edge list. + * + * 2. Two vectors: `out_src` and `out_dst`, together forming + * a "struct of arrays" representation of the edge list. + * + * 3. Three vectors: `out`, `out_src`, and `out_dst`. + * `out` is as in (1), + * and `out_src` and `out_dst` are as in (2). + * + * This class prevents users from doing anything other than that, + * and makes it easier for the three cases to share a common implementation. + * It also prevents duplication of run-time vector length checking + * (`out` must have twice the number of elements as `out_src` and `out_dst`, + * and `out_src` and `out_dst` must have the same length). + * + * @tparam IdxT Type of each node index; must be integral. + * + * The following examples show how to create an output parameter. + * + * @code + * rmat_rectangular_gen_output output1(out); + * rmat_rectangular_gen_output output2(out_src, out_dst); + * rmat_rectangular_gen_output output3(out, out_src, out_dst); + * @endcode + */ +template +class rmat_rectangular_gen_output { + public: + using out_view_type = + raft::device_mdspan, raft::row_major>; + using out_src_view_type = raft::device_vector_view; + using out_dst_view_type = raft::device_vector_view; + + private: + class output_pair { + public: + output_pair(const out_src_view_type& src, const out_dst_view_type& dst) : src_(src), dst_(dst) + { + RAFT_EXPECTS(src.extent(0) == dst.extent(0), + "rmat_rectangular_gen: " + "out_src.extent(0) = %zu != out_dst.extent(0) = %zu", + static_cast(src.extent(0)), + static_cast(dst.extent(0))); + } + + out_src_view_type out_src_view() const { return src_; } + + out_dst_view_type out_dst_view() const { return dst_; } + + IdxT number_of_edges() const { return src_.extent(0); } + + bool empty() const { return src_.extent(0) == 0 && dst_.extent(0) == 0; } + + private: + out_src_view_type src_; + out_dst_view_type dst_; + }; + + class output_triple { + public: + output_triple(const out_view_type& out, + const out_src_view_type& src, + const out_dst_view_type& dst) + : out_(out), pair_(src, dst) + { + RAFT_EXPECTS(out.extent(0) == IdxT(2) * dst.extent(0), + "rmat_rectangular_gen: " + "out.extent(0) = %zu != 2 * out_dst.extent(0) = %zu", + static_cast(out.extent(0)), + static_cast(IdxT(2) * dst.extent(0))); + } + + out_view_type out_view() const { return out_; } + + out_src_view_type out_src_view() const { return pair_.out_src_view(); } + + out_dst_view_type out_dst_view() const { return pair_.out_dst_view(); } + + IdxT number_of_edges() const { return pair_.number_of_edges(); } + + bool empty() const { return out_.extent(0) == 0 && pair_.empty(); } + + private: + out_view_type out_; + output_pair pair_; + }; + + public: + /** + * @brief You're not allowed to construct this with no vectors. + */ + rmat_rectangular_gen_output() = delete; + + /** + * @brief Constructor taking a single vector, that packs the source + * node ids and destination node ids in array-of-structs fashion. + * + * @param[out] out Generated edgelist [on device]. 
In each row, the + * first element is the source node id, and the second element is + * the destination node id. + */ + rmat_rectangular_gen_output(const out_view_type& out) : data_(out) {} + + /** + * @brief Constructor taking two vectors, that store the source node + * ids and the destination node ids separately, in + * struct-of-arrays fashion. + * + * @param[out] out_src Source node id's [on device] [len = n_edges]. + * + * @param[out] out_dst Destination node id's [on device] [len = n_edges]. + */ + rmat_rectangular_gen_output(const out_src_view_type& src, const out_dst_view_type& dst) + : data_(output_pair(src, dst)) + { + } + + /** + * @brief Constructor taking all three vectors. + * + * @param[out] out Generated edgelist [on device]. In each row, the + * first element is the source node id, and the second element is + * the destination node id. + * + * @param[out] out_src Source node id's [on device] [len = n_edges]. + * + * @param[out] out_dst Destination node id's [on device] [len = n_edges]. + */ + rmat_rectangular_gen_output(const out_view_type& out, + const out_src_view_type& src, + const out_dst_view_type& dst) + : data_(output_triple(out, src, dst)) + { + } + + /** + * @brief Whether the vector(s) are all length zero. + */ + bool empty() const + { + if (std::holds_alternative(data_)) { + return std::get(data_).extent(0) == 0; + } else if (std::holds_alternative(data_)) { + return std::get(data_).empty(); + } else { // std::holds_alternative(data_) + return std::get(data_).empty(); + } + } + + /** + * @brief Vector for the output single edgelist; the argument given + * to the one-argument constructor, or the first argument of the + * three-argument constructor; `std::nullopt` if not provided. + */ + std::optional out_view() const + { + if (std::holds_alternative(data_)) { + return std::get(data_); + } else if (std::holds_alternative(data_)) { + return std::get(data_).out_view(); + } else { // if (std::holds_alternative<>(output_pair)) + return std::nullopt; + } + } + + /** + * @brief Vector for the output source edgelist; the first argument + * given to the two-argument constructor, or the second argument + * of the three-argument constructor; `std::nullopt` if not provided. + */ + std::optional out_src_view() const + { + if (std::holds_alternative(data_)) { + return std::get(data_).out_src_view(); + } else if (std::holds_alternative(data_)) { + return std::get(data_).out_src_view(); + } else { // if (std::holds_alternative(data_)) + return std::nullopt; + } + } + + /** + * @brief Vector for the output destination edgelist; the second + * argument given to the two-argument constructor, or the third + * argument of the three-argument constructor; + * `std::nullopt` if not provided. + */ + std::optional out_dst_view() const + { + if (std::holds_alternative(data_)) { + return std::get(data_).out_dst_view(); + } else if (std::holds_alternative(data_)) { + return std::get(data_).out_dst_view(); + } else { // if (std::holds_alternative(data_)) + return std::nullopt; + } + } + + /** + * @brief Number of edges in the graph; zero if no output vector + * was provided to the constructor. 
+ */ + IdxT number_of_edges() const + { + if (std::holds_alternative(data_)) { + return std::get(data_).extent(0); + } else if (std::holds_alternative(data_)) { + return std::get(data_).number_of_edges(); + } else { // if (std::holds_alternative(data_)) + return std::get(data_).number_of_edges(); + } + } + + private: + std::variant data_; +}; + +} // end namespace detail +} // end namespace random +} // end namespace raft diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh index f1e3389924..8f0bf9fe53 100644 --- a/cpp/include/raft/random/detail/rng_device.cuh +++ b/cpp/include/raft/random/detail/rng_device.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index eead64942f..5aecbfcaa2 100644 --- a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -16,26 +16,17 @@ #pragma once -#include -#include -#include +#include #include #include +#include +#include +#include namespace raft { namespace random { namespace detail { -/** - * Some macro magic to remove optional parentheses of a macro argument. - * See https://stackoverflow.com/a/62984543 - */ -#define RAFT_DEPAREN(X) RAFT_DEPAREN_H2(RAFT_DEPAREN_H1 X) -#define RAFT_DEPAREN_H1(...) RAFT_DEPAREN_H1 __VA_ARGS__ -#define RAFT_DEPAREN_H2(...) RAFT_DEPAREN_H3(__VA_ARGS__) -#define RAFT_DEPAREN_H3(...) RAFT_DEPAREN_MAGIC##__VA_ARGS__ -#define RAFT_DEPAREN_MAGICRAFT_DEPAREN_H1 - /** * This macro will invoke function `func` with the correct instantiation of * device state as the first parameter, and passes all subsequent macro diff --git a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh index 29af59d502..f9b55dd9d0 100644 --- a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh +++ b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh @@ -23,11 +23,11 @@ #include "rng_device.cuh" #include -#include -#include -#include -#include +#include #include +#include +#include +#include #include #include diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh index 8bd78d98eb..82c940b471 100644 --- a/cpp/include/raft/random/make_blobs.cuh +++ b/cpp/include/raft/random/make_blobs.cuh @@ -21,7 +21,7 @@ #include "detail/make_blobs.cuh" #include -#include +#include namespace raft::random { diff --git a/cpp/include/raft/random/make_regression.cuh b/cpp/include/raft/random/make_regression.cuh index 4fbb48fa35..c575ea987c 100644 --- a/cpp/include/raft/random/make_regression.cuh +++ b/cpp/include/raft/random/make_regression.cuh @@ -24,6 +24,8 @@ #pragma once #include +#include +#include #include "detail/make_regression.cuh" @@ -58,7 +60,7 @@ namespace raft::random { * @param[in] tail_strength The relative importance of the fat noisy tail * of the singular values profile if * effective_rank is not -1 - * @param[in] noise Standard deviation of the gaussian noise + * @param[in] noise Standard deviation of the Gaussian noise * applied to the output * @param[in] shuffle Shuffle the samples and the features * @param[in] seed Seed for the random number generator @@ -100,6 +102,81 @@ void make_regression(const raft::handle_t& handle, type); } +/** + * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at: + * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html + * + * @tparam DataT Scalar 
type + * @tparam IdxT Index type + * + * @param[in] handle RAFT handle + * @param[out] out Row-major (samples, features) matrix to store + * the problem data + * @param[out] values Row-major (samples, targets) matrix to store + * the values for the regression problem + * @param[in] n_informative Number of informative features (non-zero + * coefficients) + * @param[out] coef If present, a row-major (features, targets) matrix + * to store the coefficients used to generate the values + * for the regression problem + * @param[in] bias A scalar that will be added to the values + * @param[in] effective_rank The approximate rank of the data matrix (used + * to create correlations in the data). -1 is the + * code to use well-conditioned data + * @param[in] tail_strength The relative importance of the fat noisy tail + * of the singular values profile if + * effective_rank is not -1 + * @param[in] noise Standard deviation of the Gaussian noise + * applied to the output + * @param[in] shuffle Shuffle the samples and the features + * @param[in] seed Seed for the random number generator + * @param[in] type Random generator type + */ +template +void make_regression(const raft::handle_t& handle, + raft::device_matrix_view out, + raft::device_matrix_view values, + IdxT n_informative, + std::optional> coef, + DataT bias = DataT{}, + IdxT effective_rank = static_cast(-1), + DataT tail_strength = DataT{0.5}, + DataT noise = DataT{}, + bool shuffle = true, + uint64_t seed = 0ULL, + GeneratorType type = GenPhilox) +{ + const auto n_samples = out.extent(0); + assert(values.extent(0) == n_samples); + const auto n_features = out.extent(1); + const auto n_targets = values.extent(1); + + const bool have_coef = coef.has_value(); + if (have_coef) { + const auto coef_ref = *coef; + assert(coef_ref.extent(0) == n_features); + assert(coef_ref.extent(1) == n_targets); + } + DataT* coef_ptr = have_coef ? (*coef).data_handle() : nullptr; + + detail::make_regression_caller(handle, + out.data_handle(), + values.data_handle(), + n_samples, + n_features, + n_informative, + handle.get_stream(), + coef_ptr, + n_targets, + bias, + effective_rank, + tail_strength, + noise, + shuffle, + seed, + type); +} + } // namespace raft::random -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp index 4f6b2717f6..f3e2113f80 100644 --- a/cpp/include/raft/random/make_regression.hpp +++ b/cpp/include/raft/random/make_regression.hpp @@ -13,98 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -/* Adapted from scikit-learn - * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py - */ - /** * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
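A short usage sketch for the new mdspan-based `make_regression` overload above (`raft::make_device_matrix` is assumed from RAFT's mdarray factories). Since `coef` is a `std::optional`, callers who do not need the coefficient matrix can simply pass `std::nullopt`:

```cpp
#include <optional>
#include <raft/core/handle.hpp>             // assumed include paths
#include <raft/random/make_regression.cuh>

void make_regression_example()
{
  raft::handle_t handle;
  const int n_samples = 100, n_features = 10, n_targets = 1;

  auto X = raft::make_device_matrix<float, int>(handle, n_samples, n_features);
  auto y = raft::make_device_matrix<float, int>(handle, n_samples, n_targets);

  // Well-conditioned problem with 5 informative features; no coefficient output.
  raft::random::make_regression<float, int>(
    handle, X.view(), y.view(), /*n_informative=*/5, std::nullopt);
}
```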
*/ -#ifndef __MAKE_REGRESSION_H -#define __MAKE_REGRESSION_H - -#pragma once - -#include - -#include "detail/make_regression.cuh" - -namespace raft::random { - /** - * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at: - * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html - * - * @tparam DataT Scalar type - * @tparam IdxT Index type - * - * @param[in] handle RAFT handle - * @param[out] out Row-major (samples, features) matrix to store - * the problem data - * @param[out] values Row-major (samples, targets) matrix to store - * the values for the regression problem - * @param[in] n_rows Number of samples - * @param[in] n_cols Number of features - * @param[in] n_informative Number of informative features (non-zero - * coefficients) - * @param[in] stream CUDA stream - * @param[out] coef Row-major (features, targets) matrix to store - * the coefficients used to generate the values - * for the regression problem. If nullptr is - * given, nothing will be written - * @param[in] n_targets Number of targets (generated values per sample) - * @param[in] bias A scalar that will be added to the values - * @param[in] effective_rank The approximate rank of the data matrix (used - * to create correlations in the data). -1 is the - * code to use well-conditioned data - * @param[in] tail_strength The relative importance of the fat noisy tail - * of the singular values profile if - * effective_rank is not -1 - * @param[in] noise Standard deviation of the gaussian noise - * applied to the output - * @param[in] shuffle Shuffle the samples and the features - * @param[in] seed Seed for the random number generator - * @param[in] type Random generator type + * DISCLAIMER: this file is deprecated: use make_regression.cuh instead */ -template -void make_regression(const raft::handle_t& handle, - DataT* out, - DataT* values, - IdxT n_rows, - IdxT n_cols, - IdxT n_informative, - cudaStream_t stream, - DataT* coef = nullptr, - IdxT n_targets = (IdxT)1, - DataT bias = (DataT)0.0, - IdxT effective_rank = (IdxT)-1, - DataT tail_strength = (DataT)0.5, - DataT noise = (DataT)0.0, - bool shuffle = true, - uint64_t seed = 0ULL, - GeneratorType type = GenPhilox) -{ - detail::make_regression_caller(handle, - out, - values, - n_rows, - n_cols, - n_informative, - stream, - coef, - n_targets, - bias, - effective_rank, - tail_strength, - noise, - shuffle, - seed, - type); -} -} // namespace raft::random +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "make_regression.cuh" diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh index 1d9d63f6c5..796a10fb65 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/multi_variable_gaussian.cuh @@ -59,6 +59,52 @@ class multi_variable_gaussian : public detail::multi_variable_gaussian_impl { ~multi_variable_gaussian() { deinit(); } }; // end of multi_variable_gaussian +/** + * @brief Matrix decomposition method for `compute_multi_variable_gaussian` to use. + * + * `compute_multi_variable_gaussian` can use any of the following methods. + * + * - `CHOLESKY`: Uses Cholesky decomposition on the normal equations. + * This may be faster than the other two methods, but less accurate. 
+ * + * - `JACOBI`: Uses the singular value decomposition (SVD) computed with + * cuSOLVER's gesvdj algorithm, which is based on the Jacobi method + * (sweeps of plane rotations). This exposes more parallelism + * for small and medium size matrices than the QR option below. + * + * - `QR`: Uses the SVD computed with cuSOLVER's gesvd algorithm, + * which is based on the QR algortihm. + */ +using detail::multi_variable_gaussian_decomposition_method; + +template +void compute_multi_variable_gaussian( + const raft::handle_t& handle, + rmm::mr::device_memory_resource& mem_resource, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X, + const multi_variable_gaussian_decomposition_method method) +{ + detail::compute_multi_variable_gaussian_impl(handle, mem_resource, x, P, X, method); +} + +template +void compute_multi_variable_gaussian( + const raft::handle_t& handle, + std::optional> x, + raft::device_matrix_view P, + raft::device_matrix_view X, + const multi_variable_gaussian_decomposition_method method) +{ + rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource(); + RAFT_EXPECTS(mem_resource_ptr != nullptr, + "compute_multi_variable_gaussian: " + "rmm::mr::get_current_device_resource() returned null; " + "please report this bug to the RAPIDS RAFT developers."); + detail::compute_multi_variable_gaussian_impl(handle, *mem_resource_ptr, x, P, X, method); +} + }; // end of namespace raft::random -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp index 6b85ec6a14..e7d78938a2 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.hpp +++ b/cpp/include/raft/random/multi_variable_gaussian.hpp @@ -18,51 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __MVG_H -#define __MVG_H +/** + * DISCLAIMER: this file is deprecated: use multi_variable_gaussian.cuh instead + */ #pragma once -#include "detail/multi_variable_gaussian.cuh" - -namespace raft::random { - -template -class multi_variable_gaussian : public detail::multi_variable_gaussian_impl { - public: - // using Decomposer = typename detail::multi_variable_gaussian_impl::Decomposer; - // using detail::multi_variable_gaussian_impl::Decomposer::chol_decomp; - // using detail::multi_variable_gaussian_impl::Decomposer::jacobi; - // using detail::multi_variable_gaussian_impl::Decomposer::qr; - - multi_variable_gaussian() = delete; - multi_variable_gaussian(const raft::handle_t& handle, - const int dim, - typename detail::multi_variable_gaussian_impl::Decomposer method) - : detail::multi_variable_gaussian_impl{handle, dim, method} - { - } - - std::size_t get_workspace_size() - { - return detail::multi_variable_gaussian_impl::get_workspace_size(); - } - - void set_workspace(T* workarea) - { - detail::multi_variable_gaussian_impl::set_workspace(workarea); - } - - void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) - { - detail::multi_variable_gaussian_impl::give_gaussian(nPoints, P, X, x); - } - - void deinit() { detail::multi_variable_gaussian_impl::deinit(); } - - ~multi_variable_gaussian() { deinit(); } -}; // end of multi_variable_gaussian - -}; // end of namespace raft::random +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "multi_variable_gaussian.cuh" diff --git a/cpp/include/raft/random/permute.cuh b/cpp/include/raft/random/permute.cuh index 1c01d589f4..17b103fab6 100644 --- a/cpp/include/raft/random/permute.cuh +++ b/cpp/include/raft/random/permute.cuh @@ -21,30 +21,163 @@ #include "detail/permute.cuh" +#include +#include +#include +#include + namespace raft::random { /** - * @brief Generate permutations of the input array. Pretty useful primitive for - * shuffling the input datasets in ML algos. See note at the end for some of its - * limitations! - * @tparam Type Data type of the array to be shuffled - * @tparam IntType Integer type used for ther perms array - * @tparam IdxType Integer type used for addressing indices - * @tparam TPB threads per block - * @param perms the output permutation indices. Typically useful only when - * one wants to refer back. If you don't need this, pass a nullptr - * @param out the output shuffled array. Pass nullptr if you don't want this to - * be written. For eg: when you only want the perms array to be filled. - * @param in input array (in-place is not supported due to race conditions!) - * @param D number of columns of the input array - * @param N length of the input array (or number of rows) - * @param rowMajor whether the input/output matrices are row or col major - * @param stream cuda stream where to launch the work - * - * @note This is NOT a uniform permutation generator! In fact, it only generates - * very small percentage of permutations. If your application really requires a - * high quality permutation generator, it is recommended that you pick - * Knuth Shuffle. + * @brief Randomly permute the rows of the input matrix. + * + * We do not support in-place permutation, so that we can compute + * in parallel without race conditions. This function is useful + * for shuffling input data sets in machine learning algorithms. + * + * @tparam InputOutputValueType Type of each element of the input matrix, + * and the type of each element of the output matrix (if provided) + * @tparam IntType Integer type of each element of `permsOut` + * @tparam IdxType Integer type of the extents of the mdspan parameters + * @tparam Layout Either `raft::row_major` or `raft::col_major` + * + * @param[in] handle RAFT handle containing the CUDA stream + * on which to run. + * @param[in] in input matrix + * @param[out] permsOut If provided, the indices of the permutation. + * @param[out] out If provided, the output matrix, containing the + * permuted rows of the input matrix `in`. (Not providing this + * is only useful if you provide `permsOut`.) + * + * @pre If `permsOut.has_value()` is `true`, + * then `(*permsOut).extent(0) == in.extent(0)` is `true`. + * + * @pre If `out.has_value()` is `true`, + * then `(*out).extents() == in.extents()` is `true`. + * + * @note This is NOT a uniform permutation generator! + * It only generates a small fraction of all possible random permutations. + * If your application needs a high-quality permutation generator, + * then we recommend Knuth Shuffle. 
+ */ +template +void permute(const raft::handle_t& handle, + raft::device_matrix_view in, + std::optional> permsOut, + std::optional> out) +{ + static_assert(std::is_integral_v, + "permute: The type of each element " + "of permsOut (if provided) must be an integral type."); + static_assert(std::is_integral_v, + "permute: The index type " + "of each mdspan argument must be an integral type."); + constexpr bool is_row_major = std::is_same_v; + constexpr bool is_col_major = std::is_same_v; + static_assert(is_row_major || is_col_major, + "permute: Layout must be either " + "raft::row_major or raft::col_major (or one of their aliases)"); + + const bool permsOut_has_value = permsOut.has_value(); + const bool out_has_value = out.has_value(); + + RAFT_EXPECTS(!permsOut_has_value || (*permsOut).extent(0) == in.extent(0), + "permute: If 'permsOut' is provided, then its extent(0) " + "must equal the number of rows of the input matrix 'in'."); + RAFT_EXPECTS(!out_has_value || (*out).extents() == in.extents(), + "permute: If 'out' is provided, then both its extents " + "must match the extents of the input matrix 'in'."); + + IntType* permsOut_ptr = permsOut_has_value ? (*permsOut).data_handle() : nullptr; + InputOutputValueType* out_ptr = out_has_value ? (*out).data_handle() : nullptr; + + if (permsOut_ptr != nullptr || out_ptr != nullptr) { + const IdxType N = in.extent(0); + const IdxType D = in.extent(1); + detail::permute( + permsOut_ptr, out_ptr, in.data_handle(), D, N, is_row_major, handle.get_stream()); + } +} + +namespace permute_impl { + +template +struct perms_out_view { +}; + +template +struct perms_out_view { + // permsOut won't have a value anyway, + // so we can pick any integral value type we want. + using type = raft::device_vector_view; +}; + +template +struct perms_out_view>, + InputOutputValueType, + IdxType, + Layout> { + using type = raft::device_vector_view; +}; + +template +using perms_out_view_t = typename perms_out_view::type; + +} // namespace permute_impl + +/** + * @brief Overload of `permute` that compiles if users pass in `std::nullopt` + * for either or both of `permsOut` and `out`. + */ +template +void permute(const raft::handle_t& handle, + raft::device_matrix_view in, + PermsOutType&& permsOut, + OutType&& out) +{ + // If PermsOutType is std::optional> + // for some T, then that type T need not be related to any of the + // other template parameters. Thus, we have to deduce it specially. + using perms_out_view_type = permute_impl:: + perms_out_view_t, InputOutputValueType, IdxType, Layout>; + using out_view_type = raft::device_matrix_view; + + static_assert(std::is_same_v, std::nullopt_t> || + std::is_same_v, std::optional>, + "permute: The type of 'out' must be either std::optional<" + "raft::device_matrix_view>, " + "or std::nullopt."); + + std::optional permsOut_arg = std::forward(permsOut); + std::optional out_arg = std::forward(out); + permute(handle, in, permsOut_arg, out_arg); +} + +/** + * @brief Legacy overload of `permute` that takes raw arrays instead of mdspan. + * + * @tparam Type Type of each element of the input matrix to be permuted + * @tparam IntType Integer type of each element of the permsOut matrix + * @tparam IdxType Integer type of the dimensions of the matrices + * @tparam TPB threads per block (do not use any value other than the default) + * + * @param[out] perms If nonnull, the indices of the permutation + * @param[out] out If nonnull, the output matrix, containing the + * permuted rows of the input matrix @c in. 
(Not providing this + * is only useful if you provide @c perms.) + * @param[in] in input matrix + * @param[in] D number of columns in the matrices + * @param[in] N number of rows in the matrices + * @param[in] rowMajor true if the matrices are row major, + * false if they are column major + * @param[in] stream CUDA stream on which to run */ template void permute(IntType* perms, @@ -60,4 +193,4 @@ void permute(IntType* perms, }; // end namespace raft::random -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp index 26e22e403b..a2fafa6574 100644 --- a/cpp/include/raft/random/permute.hpp +++ b/cpp/include/raft/random/permute.hpp @@ -18,50 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __PERMUTE_H -#define __PERMUTE_H - -#pragma once - -#include "detail/permute.cuh" - -namespace raft::random { - /** - * @brief Generate permutations of the input array. Pretty useful primitive for - * shuffling the input datasets in ML algos. See note at the end for some of its - * limitations! - * @tparam Type Data type of the array to be shuffled - * @tparam IntType Integer type used for ther perms array - * @tparam IdxType Integer type used for addressing indices - * @tparam TPB threads per block - * @param perms the output permutation indices. Typically useful only when - * one wants to refer back. If you don't need this, pass a nullptr - * @param out the output shuffled array. Pass nullptr if you don't want this to - * be written. For eg: when you only want the perms array to be filled. - * @param in input array (in-place is not supported due to race conditions!) - * @param D number of columns of the input array - * @param N length of the input array (or number of rows) - * @param rowMajor whether the input/output matrices are row or col major - * @param stream cuda stream where to launch the work - * - * @note This is NOT a uniform permutation generator! In fact, it only generates - * very small percentage of permutations. If your application really requires a - * high quality permutation generator, it is recommended that you pick - * Knuth Shuffle. + * DISCLAIMER: this file is deprecated: use permute.cuh instead */ -template -void permute(IntType* perms, - Type* out, - const Type* in, - IntType D, - IntType N, - bool rowMajor, - cudaStream_t stream) -{ - detail::permute(perms, out, in, D, N, rowMajor, stream); -} -}; // end namespace raft::random +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "permute.cuh" diff --git a/cpp/include/raft/random/rmat_rectangular_generator.cuh b/cpp/include/raft/random/rmat_rectangular_generator.cuh index aad1cf0c88..cedcca1711 100644 --- a/cpp/include/raft/random/rmat_rectangular_generator.cuh +++ b/cpp/include/raft/random/rmat_rectangular_generator.cuh @@ -21,48 +21,226 @@ namespace raft::random { /** - * @brief Generate RMAT for a rectangular shaped adjacency matrices (useful when - * graphs to be generated are bipartite) + * @brief Generate a bipartite RMAT graph for a rectangular adjacency matrix. * - * @tparam IdxT node indices type + * This is the most general of several overloads of `rmat_rectangular_gen` + * in this file, and thus has the most detailed documentation. 
+ * + * @tparam IdxT Type of each node index + * @tparam ProbT Data type used for probability distributions (either fp32 or fp64) + * + * @param[in] handle RAFT handle, containing the CUDA stream on which to schedule work + * @param[in] r underlying state of the random generator. Especially useful when + * one wants to call this API for multiple times in order to generate + * a larger graph. For that case, just create this object with the + * initial seed once and after every call continue to pass the same + * object for the successive calls. + * @param[out] out Generated edgelist [on device], packed in array-of-structs fashion. + * In each row, the first element is the source node id, + * and the second element is the destination node id. + * @param[out] out_src Source node id's [on device]. + * @param[out] out_dst Destination node id's [on device]. `out_src` and `out_dst` + * together form the struct-of-arrays representation of the same + * output data as `out`. + * @param[in] theta distribution of each quadrant at each level of resolution. + * Since these are probabilities, each of the 2x2 matrices for + * each level of the RMAT must sum to one. [on device] + * [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed + * that each of the group of 2 x 2 numbers all sum up to 1. + * @param[in] r_scale 2^r_scale represents the number of source nodes + * @param[in] c_scale 2^c_scale represents the number of destination nodes + * + * @pre `out.extent(0) == 2 * `out_src.extent(0)` is `true` + * @pre `out_src.extent(0) == out_dst.extent(0)` is `true` + * + * We call the `r_scale != c_scale` case the "rectangular adjacency matrix" case + * (in other words, generating bipartite graphs). In this case, at `depth >= r_scale`, + * the distribution is assumed to be: + * + * `[theta[4 * depth] + theta[4 * depth + 2], theta[4 * depth + 1] + theta[4 * depth + 3]; 0, 0]`. + * + * Then for `depth >= c_scale`, the distribution is assumed to be: + * + * `[theta[4 * depth] + theta[4 * depth + 1], 0; theta[4 * depth + 2] + theta[4 * depth + 3], 0]`. + * + * @note This can generate duplicate edges and self-loops. It is the responsibility of the + * caller to clean them up accordingly. + * + * @note This also only generates directed graphs. If undirected graphs are needed, then a + * separate post-processing step is expected to be done by the caller. + * + * @{ + */ +template +void rmat_rectangular_gen( + const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_vector_view theta, + raft::device_mdspan, raft::row_major> out, + raft::device_vector_view out_src, + raft::device_vector_view out_dst, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out, out_src, out_dst); + detail::rmat_rectangular_gen_impl(handle, r, theta, output, r_scale, c_scale); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that only generates + * the struct-of-arrays (two vectors) output representation. + * + * This overload only generates the struct-of-arrays (two vectors) + * output representation: output vector `out_src` of source node id's, + * and output vector `out_dst` of destination node id's. 
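A sketch of driving the most general overload above (`raft::make_device_vector` and the `rng_state.hpp` include path are assumptions). `theta` needs `4 * max(r_scale, c_scale)` probabilities, and each group of four must sum to one:

```cpp
#include <raft/core/handle.hpp>                        // assumed include paths
#include <raft/random/rmat_rectangular_generator.cuh>
#include <raft/random/rng_state.hpp>

void rmat_example()
{
  raft::handle_t handle;
  raft::random::RngState rng(/*seed=*/137);

  const int r_scale = 10, c_scale = 8, n_edges = 10000;

  // One 2x2 probability matrix per level: 4 * max(r_scale, c_scale) entries.
  auto theta = raft::make_device_vector<float, int>(handle, 4 * r_scale);
  // ... fill theta so that every group of four entries sums to one ...
  raft::device_vector_view<const float, int> theta_view = theta.view();

  auto out     = raft::make_device_vector<int, int>(handle, 2 * n_edges);  // array-of-structs
  auto out_src = raft::make_device_vector<int, int>(handle, n_edges);      // struct-of-arrays
  auto out_dst = raft::make_device_vector<int, int>(handle, n_edges);

  raft::random::rmat_rectangular_gen(
    handle, rng, theta_view, out.view(), out_src.view(), out_dst.view(), r_scale, c_scale);
}
```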
+ * + * @pre `out_src.extent(0) == out_dst.extent(0)` is `true` + */ +template +void rmat_rectangular_gen(const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_vector_view theta, + raft::device_vector_view out_src, + raft::device_vector_view out_dst, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out_src, out_dst); + detail::rmat_rectangular_gen_impl(handle, r, theta, output, r_scale, c_scale); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that only generates + * the array-of-structs (one vector) output representation. + * + * This overload only generates the array-of-structs (one vector) + * output representation: a single output vector `out`, + * where in each row, the first element is the source node id, + * and the second element is the destination node id. + */ +template +void rmat_rectangular_gen( + const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_vector_view theta, + raft::device_mdspan, raft::row_major> out, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out); + detail::rmat_rectangular_gen_impl(handle, r, theta, output, r_scale, c_scale); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that assumes the same + * a, b, c, d probability distributions across all the scales, + * and takes all three output vectors + * (`out` with the array-of-structs output representation, + * and `out_src` and `out_dst` with the struct-of-arrays + * output representation). + * + * `a`, `b, and `c` effectively replace the above overloads' + * `theta` parameter. + * + * @pre `out.extent(0) == 2 * `out_src.extent(0)` is `true` + * @pre `out_src.extent(0) == out_dst.extent(0)` is `true` + */ +template +void rmat_rectangular_gen( + const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_mdspan, raft::row_major> out, + raft::device_vector_view out_src, + raft::device_vector_view out_dst, + ProbT a, + ProbT b, + ProbT c, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out, out_src, out_dst); + detail::rmat_rectangular_gen_impl(handle, r, output, a, b, c, r_scale, c_scale); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that assumes the same + * a, b, c, d probability distributions across all the scales, + * and takes only two output vectors + * (the struct-of-arrays output representation). + * + * `a`, `b, and `c` effectively replace the above overloads' + * `theta` parameter. + * + * @pre `out_src.extent(0) == out_dst.extent(0)` is `true` + */ +template +void rmat_rectangular_gen(const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_vector_view out_src, + raft::device_vector_view out_dst, + ProbT a, + ProbT b, + ProbT c, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out_src, out_dst); + detail::rmat_rectangular_gen_impl(handle, r, output, a, b, c, r_scale, c_scale); +} + +/** + * @brief Overload of `rmat_rectangular_gen` that assumes the same + * a, b, c, d probability distributions across all the scales, + * and takes only one output vector + * (the array-of-structs output representation). + * + * `a`, `b, and `c` effectively replace the above overloads' + * `theta` parameter. 
+ */ +template +void rmat_rectangular_gen( + const raft::handle_t& handle, + raft::random::RngState& r, + raft::device_mdspan, raft::row_major> out, + ProbT a, + ProbT b, + ProbT c, + IdxT r_scale, + IdxT c_scale) +{ + detail::rmat_rectangular_gen_output output(out); + detail::rmat_rectangular_gen_impl(handle, r, output, a, b, c, r_scale, c_scale); +} + +/** + * @brief Legacy overload of `rmat_rectangular_gen` + * taking raw arrays instead of mdspan. + * + * @tparam IdxT type of each node index * @tparam ProbT data type used for probability distributions (either fp32 or fp64) * - * @param[out] out generated edgelist [on device] [dim = n_edges x 2]. On each row - * the first element corresponds to the source node id while the - * second, the destination node id. If you don't need this output + * @param[out] out generated edgelist [on device] [dim = n_edges x 2]. In each row + * the first element is the source node id, and the second element + * is the destination node id. If you don't need this output * then pass a `nullptr` in its place. * @param[out] out_src list of source node id's [on device] [len = n_edges]. If you * don't need this output then pass a `nullptr` in its place. * @param[out] out_dst list of destination node id's [on device] [len = n_edges]. If * you don't need this output then pass a `nullptr` in its place. * @param[in] theta distribution of each quadrant at each level of resolution. - * Since these are probabilities, each of the 2x2 matrix for + * Since these are probabilities, each of the 2x2 matrices for * each level of the RMAT must sum to one. [on device] * [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed * that each of the group of 2 x 2 numbers all sum up to 1. * @param[in] r_scale 2^r_scale represents the number of source nodes * @param[in] c_scale 2^c_scale represents the number of destination nodes * @param[in] n_edges number of edges to generate - * @param[in] stream cuda stream to schedule the work on + * @param[in] stream cuda stream on which to schedule the work * @param[in] r underlying state of the random generator. Especially useful when * one wants to call this API for multiple times in order to generate * a larger graph. For that case, just create this object with the * initial seed once and after every call continue to pass the same * object for the successive calls. - * - * When `r_scale != c_scale` it is referred to as rectangular adjacency matrix case (IOW generating - * bipartite graphs). In this case, at `depth >= r_scale`, the distribution is assumed to be: - * `[theta[4 * depth] + theta[4 * depth + 2], theta[4 * depth + 1] + theta[4 * depth + 3]; 0, 0]`. - * Then for the `depth >= c_scale`, the distribution is assumed to be: - * `[theta[4 * depth] + theta[4 * depth + 1], 0; theta[4 * depth + 2] + theta[4 * depth + 3], 0]`. - * - * @note This can generate duplicate edges and self-loops. It is the responsibility of the - * caller to clean them up accordingly. - - * @note This also only generates directed graphs. If undirected graphs are needed, then a - * separate post-processing step is expected to be done by the caller. - * - * @{ */ template void rmat_rectangular_gen(IdxT* out, @@ -80,8 +258,10 @@ void rmat_rectangular_gen(IdxT* out, } /** - * This is the same as the previous method but assumes the same a, b, c, d probability - * distributions across all the scales + * @brief Legacy overload of `rmat_rectangular_gen` + * taking raw arrays instead of mdspan. 
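Continuing the sketch from above (same includes), the `a`, `b`, `c` overloads drop `theta` entirely; the fourth probability is implied as `d = 1 - a - b - c`. The values below are the common Graph500-style parameters, shown only as an illustration:

```cpp
void rmat_abc_example(const raft::handle_t& handle,
                      raft::random::RngState& rng,
                      raft::device_vector_view<int, int> out_src,
                      raft::device_vector_view<int, int> out_dst)
{
  const int r_scale = 10, c_scale = 8;
  // d is implied as 1 - a - b - c = 0.05 (Graph500-style parameters).
  raft::random::rmat_rectangular_gen(
    handle, rng, out_src, out_dst, 0.57f, 0.19f, 0.19f, r_scale, c_scale);
}
```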
+ * This overload assumes the same a, b, c, d probability distributions + * across all the scales. */ template void rmat_rectangular_gen(IdxT* out, @@ -99,6 +279,7 @@ void rmat_rectangular_gen(IdxT* out, detail::rmat_rectangular_gen_caller( out, out_src, out_dst, a, b, c, r_scale, c_scale, n_edges, stream, r); } + /** @} */ } // end namespace raft::random diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 85d9abe263..8ea985b559 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -19,13 +19,40 @@ #include "detail/rng_impl.cuh" #include "detail/rng_impl_deprecated.cuh" // necessary for now (to be removed) #include "rng_state.hpp" +#include +#include +#include #include +#include +#include namespace raft::random { /** * @brief Generate uniformly distributed numbers in the given range * + * @tparam OutputValueType Data type of output random number + * @tparam Index Data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output array + * @param[in] start start of the range + * @param[in] end end of the range + */ +template +void uniform(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType start, + OutputValueType end) +{ + detail::uniform(rng_state, out.data_handle(), out.extent(0), start, end, handle.get_stream()); +} + +/** + * @brief Legacy overload of `uniform` taking raw pointers + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -49,6 +76,34 @@ void uniform(const raft::handle_t& handle, /** * @brief Generate uniformly distributed integers in the given range * + * @tparam OutputValueType Integral type; value type of the output vector + * @tparam IndexType Type used to represent length of the output vector + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output vector of random numbers + * @param[in] start start of the range + * @param[in] end end of the range + */ +template +void uniformInt(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType start, + OutputValueType end) +{ + static_assert( + std::is_same::type>::value, + "uniformInt: The output vector must be a view of nonconst, " + "so that we can write to it."); + static_assert(std::is_integral::value, + "uniformInt: The elements of the output vector must have integral type."); + detail::uniformInt(rng_state, out.data_handle(), out.extent(0), start, end, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `uniformInt` + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -71,6 +126,29 @@ void uniformInt(const raft::handle_t& handle, /** * @brief Generate normal distributed numbers + * with a given mean and standard deviation + * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output array + * @param[in] mu mean of the distribution + * @param[in] sigma 
standard deviation of the distribution + */ +template
+void normal(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType sigma) +{ + detail::normal(rng_state, out.data_handle(), out.extent(0), mu, sigma, handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `normal`. * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays @@ -95,6 +173,35 @@ void normal(const raft::handle_t& handle, /** * @brief Generate normal distributed integers * + * @tparam OutputValueType Integral type; value type of the output vector + * @tparam IndexType Integral type of the output vector's length + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output array + * @param[in] mu mean of the distribution + * @param[in] sigma standard deviation of the distribution + */ +template
+void normalInt(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType sigma) +{ + static_assert( + std::is_same::type>::value, + "normalInt: The output vector must be a view of nonconst, " + "so that we can write to it."); + static_assert(std::is_integral::value, + "normalInt: The output vector's value type must be an integer."); + + detail::normalInt(rng_state, out.data_handle(), out.extent(0), mu, sigma, handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `normalInt` + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -121,7 +228,70 @@ void normalInt(const raft::handle_t& handle, * * Each row in this table conforms to a normally distributed n-dim vector * whose mean is the input vector and standard deviation is the corresponding - * vector or scalar. Correlations among the dimensions itself is assumed to + * vector or scalar. Correlations among the dimensions themselves are assumed to + * be absent. + * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[in] mu_vec mean vector (of length `out.extent(1)`) + * @param[in] sigma Either a vector of per-component standard deviations + * (of length `out.extent(1)`), + * or a scalar standard deviation for all components. + * @param[out] out the output table + */ +template
+void normalTable( + const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view mu_vec, + std::variant, OutputValueType> sigma, + raft::device_matrix_view out) +{ + const OutputValueType* sigma_vec_ptr = nullptr; + OutputValueType sigma_value{}; + + using sigma_vec_type = raft::device_vector_view; + if (std::holds_alternative(sigma)) { + auto sigma_vec = std::get(sigma); + RAFT_EXPECTS(sigma_vec.extent(0) == out.extent(1), + "normalTable: The sigma vector " + "has length %zu, which does not equal the number of columns " + "in the output table %zu.", + static_cast(sigma_vec.extent(0)), + static_cast(out.extent(1))); + // The extra length check makes this work even if sigma_vec views a std::vector, + // where .data() need not return nullptr even if .size() is zero. + sigma_vec_ptr = sigma_vec.extent(0) == 0 ?
nullptr : sigma_vec.data_handle(); + } else { + sigma_value = std::get(sigma); + } + + RAFT_EXPECTS(mu_vec.extent(0) == out.extent(1), + "normalTable: The mu vector " + "has length %zu, which does not equal the number of columns " + "in the output table %zu.", + static_cast(mu_vec.extent(0)), + static_cast(out.extent(1))); + + detail::normalTable(rng_state, + out.data_handle(), + out.extent(0), + out.extent(1), + mu_vec.data_handle(), + sigma_vec_ptr, + sigma_value, + handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `normalTable`. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions themselves are assumed to + * be absent. * * @tparam OutType data type of output random number @@ -151,7 +321,27 @@ void normalTable(const raft::handle_t& handle, } /** - * @brief Fill an array with the given value + * @brief Fill a vector with the given value + * + * @tparam OutputValueType Value type of the output vector + * @tparam IndexType Integral type used to represent length of the output vector + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[in] val value with which to fill the output vector + * @param[out] out the output vector + */ +template
+void fill(const raft::handle_t& handle, + RngState& rng_state, + OutputValueType val, + raft::device_vector_view out) +{ + detail::fill(rng_state, out.data_handle(), out.extent(0), val, handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `fill` * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays @@ -170,6 +360,28 @@ void fill(const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenTy /** * @brief Generate bernoulli distributed boolean array * + * @tparam OutputValueType Type of each element of the output vector; + * must be able to represent boolean values (e.g., `bool`) + * @tparam IndexType Integral type of the output vector's length + * @tparam Type Data type in which to compute the probabilities + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output vector + * @param[in] prob coin-toss probability for heads + */ +template
+void bernoulli(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + Type prob) +{ + detail::bernoulli(rng_state, out.data_handle(), out.extent(0), prob, handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `bernoulli` + * * @tparam Type data type in which to compute the probabilities * @tparam OutType output data type * @tparam LenType data type used to represent length of the arrays @@ -190,6 +402,29 @@ void bernoulli( /** * @brief Generate bernoulli distributed array and applies scale * + * @tparam OutputValueType Data type in which to compute the probabilities + * @tparam IndexType Integral type of the output vector's length + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output vector + * @param[in] prob coin-toss probability for heads + * @param[in] scale scaling factor + */ +template
+void scaled_bernoulli(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType prob, + OutputValueType scale) +{
+ detail::scaled_bernoulli( + rng_state, out.data_handle(), out.extent(0), prob, scale, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `scaled_bernoulli` + * * @tparam OutType data type in which to compute the probabilities * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -213,6 +448,29 @@ void scaled_bernoulli(const raft::handle_t& handle, /** * @brief Generate Gumbel distributed random numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out output array + * @param[in] mu mean value + * @param[in] beta scale value + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ +template +void gumbel(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType beta) +{ + detail::gumbel(rng_state, out.data_handle(), out.extent(0), mu, beta, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `gumbel`. + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -237,6 +495,28 @@ void gumbel(const raft::handle_t& handle, /** * @brief Generate lognormal distributed numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out the output array + * @param[in] mu mean of the distribution + * @param[in] sigma standard deviation of the distribution + */ +template +void lognormal(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType sigma) +{ + detail::lognormal(rng_state, out.data_handle(), out.extent(0), mu, sigma, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `lognormal`. 
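All of these vector-view overloads share one calling pattern: the output view carries the pointer, the length, and the element type, so the separate `ptr`/`len` arguments of the legacy overloads disappear. A minimal sketch of that pattern, with an arbitrary buffer size and seed; constructing the view directly from a raw RMM buffer is one option, RAFT's mdarray factories are another:

```cpp
#include <raft/core/handle.hpp>
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

void rng_example(const raft::handle_t& handle)
{
  raft::random::RngState rng(42ULL);

  rmm::device_uvector<float> buf(1024, handle.get_stream());
  raft::device_vector_view<float, int> out(buf.data(), 1024);

  // Element type and length are both deduced from the view.
  raft::random::uniform(handle, rng, out, 0.0f, 1.0f);  // draws in the given range
  raft::random::normal(handle, rng, out, 0.0f, 1.0f);   // mu = 0, sigma = 1
}
```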
+ * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -244,7 +524,7 @@ void gumbel(const raft::handle_t& handle, * @param[out] ptr the output array * @param[in] len the number of elements in the output * @param[in] mu mean of the distribution - * @param[in] sigma std-dev of the distribution + * @param[in] sigma standard deviation of the distribution */ template void lognormal(const raft::handle_t& handle, @@ -260,6 +540,28 @@ void lognormal(const raft::handle_t& handle, /** * @brief Generate logistic distributed random numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out output array + * @param[in] mu mean value + * @param[in] scale scale value + */ +template +void logistic(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType scale) +{ + detail::logistic(rng_state, out.data_handle(), out.extent(0), mu, scale, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `logistic`. + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -283,13 +585,33 @@ void logistic(const raft::handle_t& handle, /** * @brief Generate exponentially distributed random numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out output array + * @param[in] lambda the exponential distribution's lambda parameter + */ +template +void exponential(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType lambda) +{ + detail::exponential(rng_state, out.data_handle(), out.extent(0), lambda, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `exponential`. + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management * @param[in] rng_state random number generator state * @param[out] ptr output array * @param[in] len number of elements in the output array - * @param[in] lambda the lambda + * @param[in] lambda the exponential distribution's lambda parameter */ template void exponential( @@ -301,13 +623,33 @@ void exponential( /** * @brief Generate rayleigh distributed random numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out output array + * @param[in] sigma the distribution's sigma parameter + */ +template +void rayleigh(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType sigma) +{ + detail::rayleigh(rng_state, out.data_handle(), out.extent(0), sigma, handle.get_stream()); +} + +/** + * @brief Legacy raw pointer overload of `rayleigh`. 
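The `std::variant` sigma parameter of the `normalTable` overload earlier in this header can be exercised as follows; a sketch with hypothetical sizes (128 rows, 8 columns) that chooses the scalar alternative:

```cpp
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

#include <variant>

void normal_table_example(const raft::handle_t& handle, raft::random::RngState& rng)
{
  // Hypothetical shape: 128 rows x 8 columns, row-major on device.
  rmm::device_uvector<float> table(128 * 8, handle.get_stream());
  rmm::device_uvector<float> mu(8, handle.get_stream());
  // ... fill mu (omitted) ...

  raft::device_matrix_view<float, int, raft::row_major> out(table.data(), 128, 8);
  raft::device_vector_view<const float, int> mu_vec(mu.data(), 8);

  // A scalar sigma applies to every column; a device_vector_view<const float, int>
  // of length 8 would supply per-column sigmas instead.
  std::variant<raft::device_vector_view<const float, int>, float> sigma{2.0f};
  raft::random::normalTable(handle, rng, mu_vec, sigma, out);
}
```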
+ * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management * @param[in] rng_state random number generator state * @param[out] ptr output array * @param[in] len number of elements in the output array - * @param[in] sigma the sigma + * @param[in] sigma the distribution's sigma parameter */ template void rayleigh( @@ -319,6 +661,28 @@ void rayleigh( /** * @brief Generate laplace distributed random numbers * + * @tparam OutputValueType data type of output random number + * @tparam IndexType data type used to represent length of the arrays + * + * @param[in] handle raft handle for resource management + * @param[in] rng_state random number generator state + * @param[out] out output array + * @param[in] mu the mean + * @param[in] scale the scale + */ +template
+void laplace(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view out, + OutputValueType mu, + OutputValueType scale) +{ + detail::laplace(rng_state, out.data_handle(), out.extent(0), mu, scale, handle.get_stream()); +} +
+/** + * @brief Legacy raw pointer overload of `laplace`. + * * @tparam OutType data type of output random number * @tparam LenType data type used to represent length of the arrays * @param[in] handle raft handle for resource management @@ -340,20 +704,143 @@ void laplace(const raft::handle_t& handle, } /** - * @brief Sample the input array without replacement, optionally based on the - * input weight vector for each element in the array + * @brief Sample the input vector without replacement, optionally based on the + * input weight vector for each element in the array. + * + * The implementation is based on the `one-pass sampling` algorithm described in + * ["Accelerating weighted random sampling without + * replacement,"](https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf) + * a technical report by Kirill Mueller. + * + * If no input weight vector is provided, then input elements will be + * sampled uniformly. Otherwise, the elements sampled from the input + * vector will always appear in increasing order of their weights as + * computed using the exponential distribution. So, if you are + * particular about the order (e.g., array permutations), then + * this might not be the right choice. * - * Implementation here is based on the `one-pass sampling` algo described here: - * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * @tparam DataT type of each element of the input array @c in + * @tparam IdxT type of the dimensions of the arrays; output index type + * @tparam WeightsT type of each element of the weights array @c wts * - * @note In the sampled array the elements which are picked will always appear - * in the increasing order of their weights as computed using the exponential - * distribution. So, if you're particular about the order (for eg. array - * permutations), then this might not be the right choice! + * @note Please do not specify template parameters explicitly, + * as the compiler can deduce them from the arguments. + * + * @param[in] handle RAFT handle containing (among other resources) + * the CUDA stream on which to run. + * @param[inout] rng_state Pseudorandom number generator state. + * @param[in] in Input vector to be sampled. + * @param[in] wts Optional weights vector. + * If not provided, uniform sampling will be used.
+ * @param[out] out Vector of samples from the input vector. + * @param[out] outIdx If provided, vector of the indices + * sampled from the input array. + * + * @pre The number of samples `out.extent(0)` + * is less than or equal to the number of inputs `in.extent(0)`. + * + * @pre The number of weights `wts.extent(0)` + * equals the number of inputs `in.extent(0)`. + */ +template
+void sample_without_replacement(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view in, + std::optional> wts, + raft::device_vector_view out, + std::optional> outIdx) +{ + static_assert(std::is_integral::value, "IdxT must be an integral type."); + const IdxT sampledLen = out.extent(0); + const IdxT len = in.extent(0); + RAFT_EXPECTS(sampledLen <= len, + "sampleWithoutReplacement: " + "sampledLen (out.extent(0)) must be <= len (in.extent(0))"); + RAFT_EXPECTS(len == 0 || in.data_handle() != nullptr, + "sampleWithoutReplacement: " + "If in.extent(0) != 0, then in.data_handle() must be nonnull"); + RAFT_EXPECTS(sampledLen == 0 || out.data_handle() != nullptr, + "sampleWithoutReplacement: " + "If out.extent(0) != 0, then out.data_handle() must be nonnull"); + + const bool outIdx_has_value = outIdx.has_value(); + if (outIdx_has_value) { + RAFT_EXPECTS((*outIdx).extent(0) == sampledLen, + "sampleWithoutReplacement: " + "If outIdx is provided, its extent(0) must equal out.extent(0)"); + } + IdxT* outIdx_ptr = outIdx_has_value ? (*outIdx).data_handle() : nullptr; + + const bool wts_has_value = wts.has_value(); + if (wts_has_value) { + RAFT_EXPECTS((*wts).extent(0) == len, + "sampleWithoutReplacement: " + "If wts is provided, its extent(0) must equal in.extent(0)"); + } + const WeightsT* wts_ptr = wts_has_value ? (*wts).data_handle() : nullptr; + + detail::sampleWithoutReplacement(rng_state, + out.data_handle(), + outIdx_ptr, + in.data_handle(), + wts_ptr, + sampledLen, + len, + handle.get_stream()); +} +
+namespace sample_without_replacement_impl { +template
+struct weight_alias { +}; +
+template <> +struct weight_alias { + using type = double; +}; +
+template
+struct weight_alias>> { + using type = typename raft::device_vector_view::value_type; +}; +
+template
+using weight_t = typename weight_alias::type; +} // namespace sample_without_replacement_impl +
+/** + * @brief Overload of `sample_without_replacement` to help the + * compiler find the above overload, in case users pass in + * `std::nullopt` for one or both of the optional arguments. + * + * Please see above for documentation of `sample_without_replacement`. + */ +template
+void sample_without_replacement(const raft::handle_t& handle, + RngState& rng_state, + raft::device_vector_view in, + WeightsVectorType&& wts, + raft::device_vector_view out, + OutIndexVectorType&& outIdx) +{ + using weight_type = sample_without_replacement_impl::weight_t< + std::remove_const_t>>; + std::optional> weights = + std::forward(wts); + std::optional> output_indices = + std::forward(outIdx); + + sample_without_replacement(handle, rng_state, in, weights, out, output_indices); +} +
+/** + * @brief Legacy version of @c sample_without_replacement (see above) + * that takes raw arrays instead of device mdspan.
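A sketch of the optional-argument call path: the helper overload accepts `std::nullopt` directly and forwards it as an empty `std::optional`, with `weight_alias` defaulting the weight type to `double` when no weights vector is given. The sizes here are hypothetical:

```cpp
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

#include <optional>

void sampling_example(const raft::handle_t& handle, raft::random::RngState& rng)
{
  const int len = 100;
  const int sampled_len = 10;
  rmm::device_uvector<float> in(len, handle.get_stream());
  rmm::device_uvector<float> out(sampled_len, handle.get_stream());
  // ... fill in (omitted) ...

  raft::device_vector_view<const float, int> in_view(in.data(), len);
  raft::device_vector_view<float, int> out_view(out.data(), sampled_len);

  // No weights and no output indices: uniform sampling, indices discarded.
  raft::random::sample_without_replacement(
    handle, rng, in_view, std::nullopt, out_view, std::nullopt);
}
```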
* * @tparam DataT data type * @tparam WeightsT weights type * @tparam IdxT index type + * * @param[in] handle raft handle for resource management * @param[in] rng_state random number generator state * @param[out] out output sampled array (of length 'sampledLen') diff --git a/cpp/include/raft/random/rng_state.hpp b/cpp/include/raft/random/rng_state.hpp index 44372902b1..ec15ef286f 100644 --- a/cpp/include/raft/random/rng_state.hpp +++ b/cpp/include/raft/random/rng_state.hpp @@ -19,6 +19,8 @@ #pragma once +#include + namespace raft { namespace random { diff --git a/cpp/include/raft/lap/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh similarity index 92% rename from cpp/include/raft/lap/detail/lap_functions.cuh rename to cpp/include/raft/solver/detail/lap_functions.cuh index 1c97392a87..cbfe12fd23 100644 --- a/cpp/include/raft/lap/detail/lap_functions.cuh +++ b/cpp/include/raft/solver/detail/lap_functions.cuh @@ -24,11 +24,11 @@ */ #pragma once -#include "d_structs.h" +#include -#include -#include -#include +#include +#include +#include #include #include @@ -39,9 +39,7 @@ #include -namespace raft { -namespace lap { -namespace detail { +namespace raft::solver::detail { const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; @@ -110,8 +108,7 @@ inline void initialReduction(raft::handle_t const& handle, dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_rowReduction<<>>( d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); @@ -149,8 +146,7 @@ inline void computeInitialAssignments(raft::handle_t const& handle, thrust::fill_n(thrust::device, row_lock_v.data(), size, 0); thrust::fill_n(thrust::device, col_lock_v.data(), size, 0); - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_computeInitialAssignments<<>>( d_costs, @@ -191,8 +187,7 @@ inline int computeRowCovers(raft::handle_t const& handle, thrust::fill_n(thrust::device, d_col_data.parents, size, vertex_t{-1}); thrust::fill_n(thrust::device, d_col_data.children, size, vertex_t{-1}); - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_computeRowCovers<<>>( d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); @@ -219,8 +214,7 @@ inline void coverZeroAndExpand(raft::handle_t const& handle, dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_coverAndExpand<<>>( d_flag, @@ -266,8 +260,7 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle, thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1}); - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. 
kernel_rowPredicateConstructionCSR<< predicates_v(size, handle.get_stream()); rmm::device_uvector addresses_v(size, handle.get_stream()); @@ -375,8 +368,7 @@ inline void reversePass(raft::handle_t const& handle, int total_blocks_1 = 0; dim3 blocks_per_grid_1; dim3 threads_per_block_1; - raft::lap::detail::calculateLinearDims( - blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size); + detail::calculateLinearDims(blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size); rmm::device_uvector elements_v(csr_size, handle.get_stream()); @@ -403,7 +395,7 @@ inline void augmentationPass(raft::handle_t const& handle, int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); + detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); rmm::device_uvector predicates_v(SP * N, handle.get_stream()); rmm::device_uvector addresses_v(SP * N, handle.get_stream()); @@ -432,7 +424,7 @@ inline void augmentationPass(raft::handle_t const& handle, int total_blocks_1 = 0; dim3 blocks_per_grid_1; dim3 threads_per_block_1; - raft::lap::detail::calculateLinearDims( + detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size); rmm::device_uvector elements_v(row_ids_csr_size, handle.get_stream()); @@ -470,7 +462,7 @@ inline void dualUpdate(raft::handle_t const& handle, rmm::device_uvector sp_min_v(SP, handle.get_stream()); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); kernel_dualUpdate_1<<>>( sp_min_v.data(), d_vertices_dev.col_slacks, @@ -481,8 +473,7 @@ inline void dualUpdate(raft::handle_t const& handle, CHECK_CUDA(handle.get_stream()); - raft::lap::detail::calculateRectangularDims( - blocks_per_grid, threads_per_block, total_blocks, N, SP); + detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_dualUpdate_2<<>>( sp_min_v.data(), d_vertices_dev.row_duals, @@ -512,7 +503,7 @@ inline void calcObjValDual(raft::handle_t const& handle, dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); @@ -533,7 +524,7 @@ inline void calcObjValPrimal(raft::handle_t const& handle, dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); kernel_calcObjValPrimal<<>>( d_obj_val, d_costs, d_row_assignments, SP, N); @@ -541,6 +532,4 @@ inline void calcObjValPrimal(raft::handle_t const& handle, CHECK_CUDA(handle.get_stream()); } -} // namespace detail -} // namespace lap -} // namespace raft +} // namespace raft::solver::detail diff --git a/cpp/include/raft/lap/detail/lap_kernels.cuh b/cpp/include/raft/solver/detail/lap_kernels.cuh similarity index 98% rename from cpp/include/raft/lap/detail/lap_kernels.cuh rename to cpp/include/raft/solver/detail/lap_kernels.cuh index 728acdf7df..d66a9d72d5 100644 --- a/cpp/include/raft/lap/detail/lap_kernels.cuh +++ b/cpp/include/raft/solver/detail/lap_kernels.cuh @@ -24,19 +24,16 @@ */ #pragma 
once -#include "d_structs.h" +#include "../linear_assignment_types.hpp" -#include -#include +#include +#include #include #include #include -namespace raft { -namespace lap { -namespace detail { - +namespace raft::solver::detail { const int DORMANT{0}; const int ACTIVE{1}; const int VISITED{2}; @@ -555,6 +552,4 @@ __global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, } } -} // namespace detail -} // namespace lap -} // namespace raft +} // namespace raft::solver::detail \ No newline at end of file diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh new file mode 100644 index 0000000000..4c24dcbc29 --- /dev/null +++ b/cpp/include/raft/solver/linear_assignment.cuh @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright 2020 KETAN DATE & RAKESH NAGI + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm + * Authors: Ketan Date and Rakesh Nagi + * + * Article reference: + * Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms + * for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72. + * + */ + +#ifndef __LAP_H +#define __LAP_H + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace raft::solver { + +template +class LinearAssignmentProblem { + vertex_t size_; + vertex_t batchsize_; + weight_t epsilon_; + + weight_t const* d_costs_; + + Vertices d_vertices_dev; + VertexData d_row_data_dev, d_col_data_dev; + + raft::handle_t const& handle_; + rmm::device_uvector row_covers_v; + rmm::device_uvector col_covers_v; + rmm::device_uvector row_duals_v; + rmm::device_uvector col_duals_v; + rmm::device_uvector col_slacks_v; + rmm::device_uvector row_is_visited_v; + rmm::device_uvector col_is_visited_v; + rmm::device_uvector row_parents_v; + rmm::device_uvector col_parents_v; + rmm::device_uvector row_children_v; + rmm::device_uvector col_children_v; + rmm::device_uvector obj_val_primal_v; + rmm::device_uvector obj_val_dual_v; + + public: + LinearAssignmentProblem(raft::handle_t const& handle, + vertex_t size, + vertex_t batchsize, + weight_t epsilon) + : handle_(handle), + size_(size), + batchsize_(batchsize), + epsilon_(epsilon), + d_costs_(nullptr), + row_covers_v(0, handle_.get_stream()), + col_covers_v(0, handle_.get_stream()), + row_duals_v(0, handle_.get_stream()), + col_duals_v(0, handle_.get_stream()), + col_slacks_v(0, handle_.get_stream()), + row_is_visited_v(0, handle_.get_stream()), + col_is_visited_v(0, handle_.get_stream()), + row_parents_v(0, handle_.get_stream()), + col_parents_v(0, handle_.get_stream()), + row_children_v(0, handle_.get_stream()), + col_children_v(0, handle_.get_stream()), + obj_val_primal_v(0, handle_.get_stream()), + obj_val_dual_v(0, handle_.get_stream()) + { + } + + // Executes Hungarian algorithm on the input cost matrix. 
+ void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) + { + initializeDevice(); + + d_vertices_dev.row_assignments = d_row_assignment; + d_vertices_dev.col_assignments = d_col_assignment; + + d_costs_ = d_cost_matrix; + + int step = 0; + + while (step != 100) { + switch (step) { + case 0: step = hungarianStep0(); break; + case 1: step = hungarianStep1(); break; + case 2: step = hungarianStep2(); break; + case 3: step = hungarianStep3(); break; + case 4: step = hungarianStep4(); break; + case 5: step = hungarianStep5(); break; + case 6: step = hungarianStep6(); break; + } + } + + d_costs_ = nullptr; + } + + // Function for getting optimal row dual vector for subproblem spId. + std::pair getRowDualVector(int spId) const + { + return std::make_pair(row_duals_v.data() + spId * size_, size_); + } + + // Function for getting optimal col dual vector for subproblem spId. + std::pair getColDualVector(int spId) + { + return std::make_pair(col_duals_v.data() + spId * size_, size_); + } + + // Function for getting optimal primal objective value for subproblem spId. + weight_t getPrimalObjectiveValue(int spId) + { + weight_t result; + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); + CHECK_CUDA(handle_.get_stream()); + return result; + } + + // Function for getting optimal dual objective value for subproblem spId. + weight_t getDualObjectiveValue(int spId) + { + weight_t result; + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); + CHECK_CUDA(handle_.get_stream()); + return result; + } + + private: + // Helper function for initializing global variables and arrays on a single host. + void initializeDevice() + { + cudaStream_t stream = handle_.get_stream(); + row_covers_v.resize(batchsize_ * size_, stream); + col_covers_v.resize(batchsize_ * size_, stream); + row_duals_v.resize(batchsize_ * size_, stream); + col_duals_v.resize(batchsize_ * size_, stream); + col_slacks_v.resize(batchsize_ * size_, stream); + row_is_visited_v.resize(batchsize_ * size_, stream); + col_is_visited_v.resize(batchsize_ * size_, stream); + row_parents_v.resize(batchsize_ * size_, stream); + col_parents_v.resize(batchsize_ * size_, stream); + row_children_v.resize(batchsize_ * size_, stream); + col_children_v.resize(batchsize_ * size_, stream); + obj_val_primal_v.resize(batchsize_, stream); + obj_val_dual_v.resize(batchsize_, stream); + + d_vertices_dev.row_covers = row_covers_v.data(); + d_vertices_dev.col_covers = col_covers_v.data(); + + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.col_slacks = col_slacks_v.data(); + + d_row_data_dev.is_visited = row_is_visited_v.data(); + d_col_data_dev.is_visited = col_is_visited_v.data(); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); + } + + // Function for calculating initial zeros by subtracting row and column minima from each element. 
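A usage sketch for this class, based only on the constructor and methods shown in this file; the 4x4 single-batch problem and the `d_costs` pointer are hypothetical:

```cpp
#include <raft/solver/linear_assignment.cuh>
#include <rmm/device_uvector.hpp>

// d_costs: hypothetical 4 x 4 row-major cost matrix, already on the device.
float lap_example(raft::handle_t const& handle, float const* d_costs)
{
  const int n = 4;
  const int batchsize = 1;
  raft::solver::LinearAssignmentProblem<int, float> lap(handle, n, batchsize, /*epsilon=*/1e-6f);

  rmm::device_uvector<int> row_assignments(n, handle.get_stream());
  rmm::device_uvector<int> col_assignments(n, handle.get_stream());

  // Runs the Hungarian step functions that follow until the terminal step (100).
  lap.solve(d_costs, row_assignments.data(), col_assignments.data());

  // Optimal objective value of subproblem 0.
  return lap.getPrimalObjectiveValue(0);
}
```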
+ int hungarianStep0() + { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); + + return 1; + } + + // Function for computing the initial assignments. + int hungarianStep1() + { + detail::computeInitialAssignments( + handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); + + int next = 2; + + while (true) { + if ((next = hungarianStep2()) == 6) break; + + if ((next = hungarianStep3()) == 5) break; + + hungarianStep4(); + } + + return next; + } + + // Function for checking optimality and constructing predicates and covers. + int hungarianStep2() + { + int cover_count = detail::computeRowCovers( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); + + int next = (cover_count == batchsize_ * size_) ? 6 : 3; + + return next; + } + + // Function for building alternating tree rooted at unassigned rows. + int hungarianStep3() + { + int next; + + rmm::device_scalar flag_v(handle_.get_stream()); + + bool h_flag = false; + flag_v.set_value_async(h_flag, handle_.get_stream()); + + detail::executeZeroCover(handle_, + d_costs_, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + flag_v.data(), + batchsize_, + size_, + epsilon_); + + h_flag = flag_v.value(handle_.get_stream()); + + next = h_flag ? 4 : 5; + + return next; + } + + // Function for augmenting the solution along multiple node-disjoint alternating trees. + int hungarianStep4() + { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); + + detail::augmentationPass( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); + + return 2; + } + + // Function for updating dual solution to introduce new zero-cost arcs. + int hungarianStep5() + { + detail::dualUpdate( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); + + return 3; + } + + // Function for calculating primal and dual objective values at optimality. + int hungarianStep6() + { + detail::calcObjValPrimal(handle_, + obj_val_primal_v.data(), + d_costs_, + d_vertices_dev.row_assignments, + batchsize_, + size_); + + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); + + return 100; + } +}; +
+} // namespace raft::solver +
+#endif \ No newline at end of file diff --git a/cpp/include/raft/lap/detail/d_structs.h b/cpp/include/raft/solver/linear_assignment_types.hpp similarity index 96% rename from cpp/include/raft/lap/detail/d_structs.h rename to cpp/include/raft/solver/linear_assignment_types.hpp index 74679d64ce..3f81d3898d 100644 --- a/cpp/include/raft/lap/detail/d_structs.h +++ b/cpp/include/raft/solver/linear_assignment_types.hpp @@ -24,6 +24,7 @@ */ #pragma once +namespace raft::solver { template struct Vertices { vertex_t* row_assignments; @@ -41,3 +42,4 @@ struct VertexData { vertex_t* children; int* is_visited; }; +} // namespace raft::solver diff --git a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh index e55627c936..4549fbe343 100644 --- a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh @@ -18,10 +18,10 @@ #include -#include -#include -#include -#include +#include +#include +#include +#include #include namespace raft { @@ -29,6 +29,9 @@ namespace sparse { namespace convert { namespace detail { +// Threads per block in adj_to_csr_kernel.
+static const constexpr int adj_to_csr_tpb = 512; + /** * @brief Convert dense adjacency matrix into unsorted CSR format. * @@ -58,13 +61,14 @@ namespace detail { * the number of non-zeros in `adj`. */ template -__global__ void adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix - const index_t* row_ind, // precomputed row indices - index_t num_rows, // # rows of adj - index_t num_cols, // # cols of adj - index_t* row_counters, // pre-allocated (zeroed) atomic counters - index_t* out_col_ind // output column indices -) +__global__ void __launch_bounds__(adj_to_csr_tpb) + adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix + const index_t* row_ind, // precomputed row indices + index_t num_rows, // # rows of adj + index_t num_cols, // # cols of adj + index_t* row_counters, // pre-allocated (zeroed) atomic counters + index_t* out_col_ind // output column indices + ) { const int chunk_size = 16; typedef raft::TxN_t chunk_bool; @@ -148,17 +152,16 @@ void adj_to_csr(const raft::handle_t& handle, // independently). If the maximum number of active blocks (num_sms * // occupancy) exceeds the number of rows, assign multiple blocks to a single // row. - int threads_per_block = 1024; int dev_id, sm_count, blocks_per_sm; cudaGetDevice(&dev_id); cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, adj_to_csr_kernel, threads_per_block, 0); + &blocks_per_sm, adj_to_csr_kernel, adj_to_csr_tpb, 0); index_t max_active_blocks = sm_count * blocks_per_sm; index_t blocks_per_row = raft::ceildiv(max_active_blocks, num_rows); index_t grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row); - dim3 block(threads_per_block, 1); + dim3 block(adj_to_csr_tpb, 1); dim3 grid(blocks_per_row, grid_rows); adj_to_csr_kernel diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh index 2d13bfa34e..7cc4770138 100644 --- a/cpp/include/raft/sparse/convert/detail/coo.cuh +++ b/cpp/include/raft/sparse/convert/detail/coo.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh index d945a3c785..acb77de358 100644 --- a/cpp/include/raft/sparse/convert/detail/csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/csr.cuh @@ -18,10 +18,10 @@ #include -#include -#include -#include +#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/convert/detail/dense.cuh b/cpp/include/raft/sparse/convert/detail/dense.cuh index 4f97cee8b4..2be887e836 100644 --- a/cpp/include/raft/sparse/convert/detail/dense.cuh +++ b/cpp/include/raft/sparse/convert/detail/dense.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh index 38a3c8f351..cbcbee0139 100644 --- a/cpp/include/raft/sparse/detail/coo.cuh +++ b/cpp/include/raft/sparse/detail/coo.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #include -#include +#include #include #pragma once diff --git a/cpp/include/raft/sparse/detail/csr.cuh b/cpp/include/raft/sparse/detail/csr.cuh index 1fd2bb9366..c0985779f4 100644 --- a/cpp/include/raft/sparse/detail/csr.cuh +++ b/cpp/include/raft/sparse/detail/csr.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index b9c4a61850..041991521b 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 29c823bcdb..a69352d74b 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 7c1229b0d3..cdcb0b7322 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -18,12 +18,12 @@ #include -#include -#include -#include +#include #include #include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index 9edd1305b3..53ef0326fb 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -19,9 +19,9 @@ #include "coo_spmv_strategies/dense_smem_strategy.cuh" #include "coo_spmv_strategies/hash_strategy.cuh" -#include -#include #include +#include +#include #include "../../csr.hpp" #include "../../detail/utils.h" diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 0848d24bde..e791de10bb 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -17,10 +17,10 @@ #pragma once #include -#include -#include -#include +#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 234b08e933..1f55dadc58 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -18,15 +18,15 @@ #include -#include -#include -#include +#include #include #include #include #include #include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index c6ff32caf3..0707eb2a9b 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -18,9 +18,9 @@ #include -#include -#include -#include +#include +#include +#include #include #include diff --git 
a/cpp/include/raft/sparse/distance/detail/operators.cuh b/cpp/include/raft/sparse/distance/detail/operators.cuh index b2c2e2172b..138b21e85b 100644 --- a/cpp/include/raft/sparse/distance/detail/operators.cuh +++ b/cpp/include/raft/sparse/distance/detail/operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index ab189796ea..510e02822e 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 1738dd7498..3c3b92b739 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,39 +13,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#pragma once - -namespace raft { -namespace hierarchy { - -enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; - /** - * Simple POCO for consolidating linkage results. This closely - * mirrors the trained instance variables populated in - * Scikit-learn's AgglomerativeClustering estimator. - * @tparam value_idx - * @tparam value_t + * This file is deprecated and will be removed in release 22.06. + * Please use raft/cluster/single_linkage_types.hpp instead. */ -template -class linkage_output { - public: - value_idx m; - value_idx n_clusters; - - value_idx n_leaves; - value_idx n_connected_components; - value_idx* labels; // size: m +#pragma once - value_idx* children; // size: (m-1, 2) -}; +#pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use raft/cluster/single_linkage_types.hpp instead.") -class linkage_output_int_float : public linkage_output { -}; -class linkage_output__int64_float : public linkage_output { -}; +#include -}; // namespace hierarchy -}; // namespace raft \ No newline at end of file +namespace raft::hierarchy { +using raft::cluster::linkage_output; +using raft::cluster::linkage_output_int; +using raft::cluster::linkage_output_int64; +using raft::cluster::LinkageDistance; +} // namespace raft::hierarchy \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.cuh b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh index 86940005b4..dbf353da73 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.cuh +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh @@ -13,53 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __SINGLE_LINKAGE_H -#define __SINGLE_LINKAGE_H +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the raft/cluster version instead.
+ */ #pragma once -#include -#include - -namespace raft { -namespace hierarchy { +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/cluster version instead.") -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control - * of k. The algorithm will set `k = log(n) + c` - * @param[in] n_clusters number of clusters to assign data samples - */ -template -void single_linkage(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - detail::single_linkage( - handle, X, m, n, metric, out, c, n_clusters); -} -}; // namespace hierarchy -}; // namespace raft +#include +#include -#endif \ No newline at end of file +namespace raft::hierarchy { +using raft::cluster::single_linkage; +} \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 80c3c3c521..72fe2e51a5 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -20,4 +20,8 @@ #pragma once -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/cluster version instead.") + +#include diff --git a/cpp/include/raft/sparse/linalg/add.hpp b/cpp/include/raft/sparse/linalg/add.hpp index 39ab2d6450..e6930eaee7 100644 --- a/cpp/include/raft/sparse/linalg/add.hpp +++ b/cpp/include/raft/sparse/linalg/add.hpp @@ -18,87 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __SPARSE_ADD_H -#define __SPARSE_ADD_H - -#pragma once - -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param out_ind: output row_ind array - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use add.cuh instead */ -template -size_t csr_add_calc_inds(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* out_ind, - cudaStream_t stream) -{ - return detail::csr_add_calc_inds( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, out_ind, stream); -} -/** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param c_ind: output row_ind array - * @param c_indptr: output ind_ptr array - * @param c_val: output data array - * @param stream: cuda stream to use - */ -template -void csr_add_finalize(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* c_ind, - int* c_indptr, - T* c_val, - cudaStream_t stream) -{ - detail::csr_add_finalize( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "add.cuh" diff --git a/cpp/include/raft/sparse/linalg/degree.hpp b/cpp/include/raft/sparse/linalg/degree.hpp index 7cece7908e..240cfd452f 100644 --- a/cpp/include/raft/sparse/linalg/degree.hpp +++ b/cpp/include/raft/sparse/linalg/degree.hpp @@ -18,111 +18,14 @@ * Please use the cuh version instead. 
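For downstream code, each of these deprecations is just an include swap; a sketch of the migration for the `add` header above (the same pattern applies to `degree`, `norm`, and the other forwarding headers in this diff):

```cpp
// Before: deprecated forwarding header; compiling this emits the #pragma message.
#include <raft/sparse/linalg/add.hpp>

// After: include the maintained header directly.
#include <raft/sparse/linalg/add.cuh>
```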
*/ -#ifndef __SPARSE_DEGREE_H -#define __SPARSE_DEGREE_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace linalg { - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @param rows: rows array of the COO matrix - * @param nnz: size of the rows array - * @param results: output result array - * @param stream: cuda stream to use - */ -template -void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) -{ - detail::coo_degree<64, T>(rows, nnz, results, stream); -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: type name of underlying values array - * @param in: input COO object for counting rows - * @param results: output array with row counts (size=in->n_rows) - * @param stream: cuda stream to use - */ -template -void coo_degree(COO* in, int* results, cudaStream_t stream) -{ - coo_degree(in->rows(), in->nnz, results, stream); -} - -/** - * @brief Count the number of values for each row that doesn't match a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_degree_scalar( - const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) -{ - detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream); -} - -/** - * @brief Count the number of values for each row that doesn't match a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) -{ - coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream); -} - /** - * @brief Count the number of nonzeros for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param results: output row counts - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use degree.cuh instead */ -template -void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) -{ - detail::coo_degree_nz<64>(rows, vals, nnz, results, stream); -} -/** - * @brief Count the number of nonzero values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_degree_nz(COO* in, int* results, cudaStream_t stream) -{ - coo_degree_nz(in->rows(), in->vals(), in->nnz, results, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "degree.cuh" diff --git a/cpp/include/raft/sparse/linalg/detail/add.cuh b/cpp/include/raft/sparse/linalg/detail/add.cuh index 5c3d07fc02..ea1356938e 100644 --- a/cpp/include/raft/sparse/linalg/detail/add.cuh +++ b/cpp/include/raft/sparse/linalg/detail/add.cuh @@ -18,9 +18,9 @@ #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh index bf5484d3a4..86fcdb58d6 100644 --- a/cpp/include/raft/sparse/linalg/detail/degree.cuh +++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index ba0ecd5dcc..c2a8aa4246 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -17,9 +17,9 @@ #pragma once #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index c295932719..cdc0e62130 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -14,12 +14,12 @@ * limitations under the License. */ -#include +#include -#include #include #include #include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh index 9143aac84f..358e7d6d29 100644 --- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh @@ -18,14 +18,14 @@ #include -#include -#include #include +#include +#include #include #include -#include #include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/transpose.h b/cpp/include/raft/sparse/linalg/detail/transpose.h index 4820b489d1..1484804348 100644 --- a/cpp/include/raft/sparse/linalg/detail/transpose.h +++ b/cpp/include/raft/sparse/linalg/detail/transpose.h @@ -18,9 +18,9 @@ #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/linalg/norm.hpp b/cpp/include/raft/sparse/linalg/norm.hpp index 1f054e63ab..64261f1178 100644 --- a/cpp/include/raft/sparse/linalg/norm.hpp +++ b/cpp/include/raft/sparse/linalg/norm.hpp @@ -18,61 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __SPARSE_NORM_H -#define __SPARSE_NORM_H - -#pragma once - -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use norm.cuh instead */ -template -void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T* result, - cudaStream_t stream) -{ // output array - detail::csr_row_normalize_l1(ia, vals, nnz, m, result, stream); -} -/** - * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use - */ -template -void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T* result, - cudaStream_t stream) -{ - detail::csr_row_normalize_max(ia, vals, nnz, m, result, stream); -} +#pragma once -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "norm.cuh" diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index fe95d1414c..0a97619e87 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -16,7 +16,7 @@ #ifndef __SPARSE_SPECTRAL_H #define __SPARSE_SPECTRAL_H -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/linalg/spectral.hpp b/cpp/include/raft/sparse/linalg/spectral.hpp index ff400f1f0f..d7009db03f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.hpp +++ b/cpp/include/raft/sparse/linalg/spectral.hpp @@ -18,31 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_SPECTRAL_H -#define __SPARSE_SPECTRAL_H - -#include -#include +/** + * DISCLAIMER: this file is deprecated: use spectral.cuh instead + */ -namespace raft { -namespace sparse { -namespace spectral { +#pragma once -template -void fit_embedding(const raft::handle_t& handle, - int* rows, - int* cols, - T* vals, - int nnz, - int n, - int n_components, - T* out, - unsigned long long seed = 1234567) -{ - detail::fit_embedding(handle, rows, cols, vals, nnz, n, n_components, out, seed); -} -}; // namespace spectral -}; // namespace sparse -}; // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the cuh version instead.") -#endif +#include "spectral.cuh" diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh index 8f0105f512..fa0031aab6 100644 --- a/cpp/include/raft/sparse/linalg/transpose.cuh +++ b/cpp/include/raft/sparse/linalg/transpose.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/linalg/transpose.hpp b/cpp/include/raft/sparse/linalg/transpose.hpp index c709c20473..a6a0539319 100644 --- a/cpp/include/raft/sparse/linalg/transpose.hpp +++ b/cpp/include/raft/sparse/linalg/transpose.hpp @@ -18,62 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __TRANSPOSE_H -#define __TRANSPOSE_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace linalg { - /** - * Transpose a set of CSR arrays into a set of CSC arrays. - * @tparam value_idx : data type of the CSR index arrays - * @tparam value_t : data type of the CSR data array - * @param[in] handle : used for invoking cusparse - * @param[in] csr_indptr : CSR row index array - * @param[in] csr_indices : CSR column indices array - * @param[in] csr_data : CSR data array - * @param[out] csc_indptr : CSC row index array - * @param[out] csc_indices : CSC column indices array - * @param[out] csc_data : CSC data array - * @param[in] csr_nrows : Number of rows in CSR - * @param[in] csr_ncols : Number of columns in CSR - * @param[in] nnz : Number of nonzeros of CSR - * @param[in] stream : Cuda stream for ordering events + * DISCLAIMER: this file is deprecated: use transpose.cuh instead */ -template -void csr_transpose(const raft::handle_t& handle, - const value_idx* csr_indptr, - const value_idx* csr_indices, - const value_t* csr_data, - value_idx* csc_indptr, - value_idx* csc_indices, - value_t* csc_data, - value_idx csr_nrows, - value_idx csr_ncols, - value_idx nnz, - cudaStream_t stream) -{ - detail::csr_transpose(handle.get_cusparse_handle(), - csr_indptr, - csr_indices, - csr_data, - csc_indptr, - csc_indices, - csc_data, - csr_nrows, - csr_ncols, - nnz, - stream); -} -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "transpose.cuh" diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index 70a6ff521f..8f1a365f3f 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -14,44 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __MST_H -#define __MST_H +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */ #pragma once -#include "mst_solver.cuh" - -namespace raft { -namespace mst { - -template -raft::Graph_COO mst(const raft::handle_t& handle, - edge_t const* offsets, - vertex_t const* indices, - weight_t const* weights, - vertex_t const v, - edge_t const e, - vertex_t* color, - cudaStream_t stream, - bool symmetrize_output = true, - bool initialize_colors = true, - int iterations = 0) -{ - MST_solver mst_solver(handle, - offsets, - indices, - weights, - v, - e, - color, - stream, - symmetrize_output, - initialize_colors, - iterations); - return mst_solver.solve(); -} +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/sparse/solver version instead.") -} // namespace mst -} // namespace raft +#include +#include -#endif \ No newline at end of file +namespace raft::mst { +using raft::sparse::solver::mst; +} \ No newline at end of file diff --git a/cpp/include/raft/sparse/mst/mst.hpp b/cpp/include/raft/sparse/mst/mst.hpp index 5a66e8c815..1ad053d97c 100644 --- a/cpp/include/raft/sparse/mst/mst.hpp +++ b/cpp/include/raft/sparse/mst/mst.hpp @@ -21,4 +21,9 @@ */ #pragma once -#include "mst.cuh" \ No newline at end of file +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/sparse/solver version instead.") + +#include +#include diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index bae5d77d8e..6af2226b99 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -1,6 +1,6 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,91 +15,22 @@ * limitations under the License. */ +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ #pragma once -#include -#include -#include - -namespace raft { - -template -struct Graph_COO { - rmm::device_uvector src; - rmm::device_uvector dst; - rmm::device_uvector weights; - edge_t n_edges; - - Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) - { - } -}; - -namespace mst { - -template -class MST_solver { - public: - MST_solver(const raft::handle_t& handle_, - const edge_t* offsets_, - const vertex_t* indices_, - const weight_t* weights_, - const vertex_t v_, - const edge_t e_, - vertex_t* color_, - cudaStream_t stream_, - bool symmetrize_output_, - bool initialize_colors_, - int iterations_); - - raft::Graph_COO solve(); +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
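The two `mst` hunks above do more than redirect headers: they also re-export the relocated symbols so that old namespace spellings keep resolving. A stand-in sketch of that `using`-declaration mechanism, with toy names rather than the real solver:

```cpp
#include <cstdio>

// Stand-in for the relocated implementation (the real one is
// raft::sparse::solver::mst / MST_solver, with the argument list shown above).
namespace raft::sparse::solver {
inline int mst_stub(int v) { return v; }
}

// Same pattern as the `using raft::sparse::solver::mst;` line in the hunk:
// the old namespace keeps working as an alias for the new location.
namespace raft::mst {
using raft::sparse::solver::mst_stub;
}

int main()
{
  // Old-namespace call sites continue to compile unchanged...
  std::printf("%d\n", raft::mst::mst_stub(42));
  // ...but new code should spell out the new namespace.
  return raft::sparse::solver::mst_stub(0); /* == 0 */
}
```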
\ + " Please use the raft/sparse/solver version instead.") - ~MST_solver() {} +#include - private: - const raft::handle_t& handle; - cudaStream_t stream; - bool symmetrize_output, initialize_colors; - int iterations; - - // CSR - const edge_t* offsets; - const vertex_t* indices; - const weight_t* weights; - const vertex_t v; - const edge_t e; - - vertex_t max_blocks; - vertex_t max_threads; - vertex_t sm_count; - - vertex_t* color_index; // represent each supervertex as a color - rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_uvector new_mst_edge; // new minimum edge per vertex - rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_scalar - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_uvector next_color; // next iteration color - rmm::device_uvector color; // index of color that vertex points to - - // new src-dst pairs found per iteration - rmm::device_uvector temp_src; - rmm::device_uvector temp_dst; - rmm::device_uvector temp_weights; - - void label_prop(vertex_t* mst_src, vertex_t* mst_dst); - void min_edge_per_vertex(); - void min_edge_per_supervertex(); - void check_termination(); - void alteration(); - alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); -}; - -} // namespace mst -} // namespace raft +namespace raft { +using raft::sparse::solver::Graph_COO; +} -#include "detail/mst_solver_inl.cuh" +namespace raft::mst { +using raft::sparse::solver::MST_solver; +} \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh index ca0ffe8180..bcc0301318 100644 --- a/cpp/include/raft/sparse/op/detail/filter.cuh +++ b/cpp/include/raft/sparse/op/detail/filter.cuh @@ -18,9 +18,9 @@ #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index eb747cce1e..b4d8cb7db9 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -18,12 +18,12 @@ #include -#include -#include #include +#include +#include -#include #include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/row_op.cuh b/cpp/include/raft/sparse/op/detail/row_op.cuh index 63c8cafaa7..5e7d2632a9 100644 --- a/cpp/include/raft/sparse/op/detail/row_op.cuh +++ b/cpp/include/raft/sparse/op/detail/row_op.cuh @@ -18,9 +18,9 @@ #include -#include -#include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/slice.cuh b/cpp/include/raft/sparse/op/detail/slice.cuh index 6bf6688076..193d246b4b 100644 --- a/cpp/include/raft/sparse/op/detail/slice.cuh +++ b/cpp/include/raft/sparse/op/detail/slice.cuh @@ -18,10 +18,10 @@ #include -#include -#include #include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h index 17dbf6a70d..2f73671132 100644 --- a/cpp/include/raft/sparse/op/detail/sort.h +++ b/cpp/include/raft/sparse/op/detail/sort.h @@ -16,11 +16,11 @@ #pragma once -#include -#include #include #include #include +#include +#include #include #include diff --git 
a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 6c36538137..488d926fe9 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/filter.hpp b/cpp/include/raft/sparse/op/filter.hpp index 3821d963b0..6a59148fd7 100644 --- a/cpp/include/raft/sparse/op/filter.hpp +++ b/cpp/include/raft/sparse/op/filter.hpp @@ -18,82 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __FILTER_H -#define __FILTER_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace op { - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. - * - * @param rows: input array of rows (size n) - * @param cols: input array of cols (size n) - * @param vals: input array of vals (size n) - * @param nnz: size of current rows/cols/vals arrays - * @param crows: compressed array of rows - * @param ccols: compressed array of cols - * @param cvals: compressed array of vals - * @param cnnz: array of non-zero counts per row - * @param cur_cnnz array of counts per row - * @param scalar: scalar to remove from arrays - * @param n: number of rows in dense matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar(const int* rows, - const int* cols, - const T* vals, - int nnz, - int* crows, - int* ccols, - T* cvals, - int* cnnz, - int* cur_cnnz, - T scalar, - int n, - cudaStream_t stream) -{ - detail::coo_remove_scalar<128, T>( - rows, cols, vals, nnz, crows, ccols, cvals, cnnz, cur_cnnz, scalar, n, stream); -} - /** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param scalar: scalar to remove from arrays - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use filter.cuh instead */ -template -void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) -{ - detail::coo_remove_scalar<128, T>(in, out, scalar, stream); -} -/** - * @brief Removes zeros from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream) -{ - coo_remove_scalar(in, out, T(0.0), stream); -} +#pragma once -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "filter.cuh" diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index fd860d2dc1..cd67e124ee 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/reduce.hpp b/cpp/include/raft/sparse/op/reduce.hpp index bb7560fa3d..37923e070c 100644 --- a/cpp/include/raft/sparse/op/reduce.hpp +++ b/cpp/include/raft/sparse/op/reduce.hpp @@ -18,75 +18,14 @@ * Please use the cuh version instead. 
 */
-#ifndef __SPARSE_REDUCE_H
-#define __SPARSE_REDUCE_H
-
-#pragma once
-
-#include
-#include
-#include
-
-namespace raft {
-namespace sparse {
-namespace op {
 /**
- * Computes a mask from a sorted COO matrix where 0's denote
- * duplicate values and 1's denote new values. This mask can
- * be useful for computing an exclusive scan to pre-build offsets
- * for reducing duplicates, such as when symmetrizing
- * or taking the min of each duplicated value.
- *
- * Note that this function always marks the first value as 0 so that
- * a cumulative sum can be performed as a follow-on. However, even
- * if the mask is used directly, any duplicates should always have a
- * 1 when first encountered so it can be assumed that the first element
- * is always a 1 otherwise.
- *
- * @tparam value_idx
- * @param[out] mask output mask, size nnz
- * @param[in] rows COO rows array, size nnz
- * @param[in] cols COO cols array, size nnz
- * @param[in] nnz number of nonzeros in input arrays
- * @param[in] stream cuda ops will be ordered wrt this stream
+ * DISCLAIMER: this file is deprecated: use reduce.cuh instead
 */
-template <typename value_idx>
-void compute_duplicates_mask(
-  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
-{
-  detail::compute_duplicates_mask(mask, rows, cols, nnz, stream);
-}
-/**
- * Performs a COO reduce of duplicate columns per row, taking the max weight
- * for duplicate columns in each row. This function assumes the input COO
- * has been sorted by both row and column but makes no assumption on
- * the sorting of values.
- * @tparam value_idx
- * @tparam value_t
- * @param[in] handle
- * @param[out] out output COO; the nnz will be computed and allocate() will be called in this function.
- * @param[in] rows COO rows array, size nnz
- * @param[in] cols COO cols array, size nnz
- * @param[in] vals COO vals array, size nnz
- * @param[in] nnz number of nonzeros in COO input arrays
- * @param[in] m number of rows in COO input matrix
- * @param[in] n number of columns in COO input matrix
- */
-template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t& handle,
-                    raft::sparse::COO<value_t, value_idx>& out,
-                    const value_idx* rows,
-                    const value_idx* cols,
-                    const value_t* vals,
-                    size_t nnz,
-                    size_t m,
-                    size_t n)
-{
-  detail::max_duplicates(handle, out, rows, cols, vals, nnz, m, n);
-}
-}; // END namespace op
-}; // END namespace sparse
-}; // END namespace raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "reduce.cuh"
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index b31d3f29b6..d73d05785d 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -17,7 +17,7 @@
 #define __SPARSE_ROW_OP_H
 #pragma once
-#include
+#include
 #include
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/op/row_op.hpp b/cpp/include/raft/sparse/op/row_op.hpp
index ac12432e92..8443f9f090 100644
--- a/cpp/include/raft/sparse/op/row_op.hpp
+++ b/cpp/include/raft/sparse/op/row_op.hpp
@@ -18,37 +18,14 @@
 * Please use the cuh version instead.
 */
-#ifndef __SPARSE_ROW_OP_H
-#define __SPARSE_ROW_OP_H
-
-#pragma once
-
-#include
-#include
-
-namespace raft {
-namespace sparse {
-namespace op {
-
 /**
- * @brief Perform a custom row operation on a CSR matrix in batches.
- * @tparam T numerical type of row_ind array
- * @tparam TPB_X number of threads per block to use for underlying kernel
- * @tparam Lambda type of custom operation function
- * @param row_ind the CSR row_ind array to perform parallel operations over
- * @param n_rows total number of vertices in graph
- * @param nnz number of non-zeros
- * @param op custom row operation functor accepting the row and beginning index.
- * @param stream cuda stream to use
+ * DISCLAIMER: this file is deprecated: use row_op.cuh instead
 */
-template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream)
-{
-  detail::csr_row_op(row_ind, n_rows, nnz, op, stream);
-}
-}; // namespace op
-}; // end NAMESPACE sparse
-}; // end NAMESPACE raft
+#pragma once
+
+#pragma message(__FILE__ \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "row_op.cuh"
diff --git a/cpp/include/raft/sparse/op/slice.cuh b/cpp/include/raft/sparse/op/slice.cuh
index cd7be1924b..30f7a97ffc 100644
--- a/cpp/include/raft/sparse/op/slice.cuh
+++ b/cpp/include/raft/sparse/op/slice.cuh
@@ -18,7 +18,7 @@
 #pragma once
 
-#include
+#include
 #include
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/op/slice.hpp b/cpp/include/raft/sparse/op/slice.hpp
index 75b7e478e5..4d7e1858de 100644
--- a/cpp/include/raft/sparse/op/slice.hpp
+++ b/cpp/include/raft/sparse/op/slice.hpp
@@ -18,69 +18,14 @@
 * Please use the cuh version instead.
 */
-#ifndef __SLICE_H
-#define __SLICE_H
-
-#pragma once
-
-#include
-#include
-
-namespace raft {
-namespace sparse {
-namespace op {
-
 /**
- * Slice consecutive rows from a CSR array and populate newly sliced indptr array
- * @tparam value_idx
- * @param[in] start_row : beginning row to slice
- * @param[in] stop_row : ending row to slice
- * @param[in] indptr : indptr of input CSR to slice
- * @param[out] indptr_out : output sliced indptr to populate
- * @param[in] start_offset : beginning column offset of input indptr
- * @param[in] stop_offset : ending column offset of input indptr
- * @param[in] stream : cuda stream for ordering events
+ * DISCLAIMER: this file is deprecated: use slice.cuh instead
 */
-template <typename value_idx>
-void csr_row_slice_indptr(value_idx start_row,
-                          value_idx stop_row,
-                          const value_idx* indptr,
-                          value_idx* indptr_out,
-                          value_idx* start_offset,
-                          value_idx* stop_offset,
-                          cudaStream_t stream)
-{
-  detail::csr_row_slice_indptr(
-    start_row, stop_row, indptr, indptr_out, start_offset, stop_offset, stream);
-}
-/**
- * Slice rows from a CSR, populate column and data arrays
- * @tparam value_idx : data type of CSR index arrays
- * @tparam value_t : data type of CSR data array
- * @param[in] start_offset : beginning column offset to slice
- * @param[in] stop_offset : ending column offset to slice
- * @param[in] indices : column indices array from input CSR
- * @param[in] data : data array from input CSR
- * @param[out] indices_out : output column indices array
- * @param[out] data_out : output data array
- * @param[in] stream : cuda stream for ordering events
- */
-template <typename value_idx, typename value_t>
-void csr_row_slice_populate(value_idx start_offset,
-                            value_idx stop_offset,
-                            const value_idx* indices,
-                            const value_t* data,
-                            value_idx* indices_out,
-                            value_t* data_out,
-                            cudaStream_t stream)
-{
-  detail::csr_row_slice_populate(
-    start_offset, stop_offset, indices, data, indices_out, data_out, stream);
-}
+#pragma once
 
-}; // namespace op
-}; // end NAMESPACE sparse
-}; // end NAMESPACE raft
+#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "slice.cuh" diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh index ae0e587c3b..ddb4b2830c 100644 --- a/cpp/include/raft/sparse/op/sort.cuh +++ b/cpp/include/raft/sparse/op/sort.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/op/sort.hpp b/cpp/include/raft/sparse/op/sort.hpp index cd363582fb..867bb1bf35 100644 --- a/cpp/include/raft/sparse/op/sort.hpp +++ b/cpp/include/raft/sparse/op/sort.hpp @@ -18,66 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __SPARSE_SORT_H -#define __SPARSE_SORT_H - -#pragma once - -#include -#include - -namespace raft { -namespace sparse { -namespace op { - /** - * @brief Sorts the arrays that comprise the coo matrix - * by row and then by column. - * - * @param m number of rows in coo matrix - * @param n number of cols in coo matrix - * @param nnz number of non-zeros - * @param rows rows array from coo matrix - * @param cols cols array from coo matrix - * @param vals vals array from coo matrix - * @param stream: cuda stream to use + * DISCLAIMER: this file is deprecated: use sort.cuh instead */ -template -void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) -{ - detail::coo_sort(m, n, nnz, rows, cols, vals, stream); -} -/** - * @brief Sort the underlying COO arrays by row - * @tparam T: the type name of the underlying value array - * @param in: COO to sort by row - * @param stream: the cuda stream to use - */ -template -void coo_sort(COO* const in, cudaStream_t stream) -{ - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); -} +#pragma once -/** - * Sorts a COO by its weight - * @tparam value_idx - * @tparam value_t - * @param[inout] rows source edges - * @param[inout] cols dest edges - * @param[inout] data edge weights - * @param[in] nnz number of edges in edge list - * @param[in] stream cuda stream for which to order cuda operations - */ -template -void coo_sort_by_weight( - value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) -{ - detail::coo_sort_by_weight(rows, cols, data, nnz, stream); -} -}; // namespace op -}; // end NAMESPACE sparse -}; // end NAMESPACE raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "sort.cuh" diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 28bb5aa74b..22d8d7e936 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -13,70 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __CONNECT_COMPONENTS_H -#define __CONNECT_COMPONENTS_H - -#include -#include -#include - -namespace raft { -namespace linkage { - -template -using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp; - /** - * Gets the number of unique components from array of - * colors or labels. This does not assume the components are - * drawn from a monotonically increasing set. 
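For reference, the `coo_sort` wrappers being deprecated in the `sort.hpp` hunk above survive unchanged under `raft/sparse/op/sort.cuh` (that path appears in this diff); a minimal call looks like this, with device pointers assumed valid and sized `nnz`:

```cpp
#include <raft/sparse/op/sort.cuh>

// Sort a COO matrix in-place, first by row and then by column, matching the
// raw-pointer overload documented in the hunk above.
void sort_coo(int m, int n, int nnz, int* rows, int* cols, float* vals,
              cudaStream_t stream)
{
  raft::sparse::op::coo_sort(m, n, nnz, rows, cols, vals, stream);
}
```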
- * @tparam value_idx - * @param[in] colors array of components - * @param[in] n_rows size of components array - * @param[in] stream cuda stream for which to order cuda operations - * @return total number of components + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -template -value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) -{ - return detail::get_n_components(colors, n_rows, stream); -} /** - * Connects the components of an otherwise unconnected knn graph - * by computing a 1-nn to neighboring components of each data point - * (e.g. component(nn) != component(self)) and reducing the results to - * include the set of smallest destination components for each source - * component. The result will not necessarily contain - * n_components^2 - n_components number of elements because many components - * will likely not be contained in the neighborhoods of 1-nns. - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[out] out output edge list containing nearest cross-component - * edges. - * @param[in] X original (row-major) dense matrix for which knn graph should be constructed. - * @param[in] orig_colors array containing component number for each row of X - * @param[in] n_rows number of rows in X - * @param[in] n_cols number of cols in X - * @param[in] reduction_op - * @param[in] metric + * DISCLAIMER: this file is deprecated: use connect_components.cuh instead */ -template -void connect_components( - const raft::handle_t& handle, - raft::sparse::COO& out, - const value_t* X, - const value_idx* orig_colors, - size_t n_rows, - size_t n_cols, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) -{ - detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric); -} -}; // end namespace linkage -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") + +#include -#endif \ No newline at end of file +namespace raft::linkage { +using raft::sparse::spatial::connect_components; +using raft::sparse::spatial::FixConnectivitiesRedOp; +using raft::sparse::spatial::get_n_components; +} // namespace raft::linkage \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/connect_components.hpp b/cpp/include/raft/sparse/selection/connect_components.hpp index 25d71367db..393ed2d4e2 100644 --- a/cpp/include/raft/sparse/selection/connect_components.hpp +++ b/cpp/include/raft/sparse/selection/connect_components.hpp @@ -18,70 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __CONNECT_COMPONENTS_H -#define __CONNECT_COMPONENTS_H - -#include -#include -#include - -namespace raft { -namespace linkage { - -template -using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp; - /** - * Gets the number of unique components from array of - * colors or labels. This does not assume the components are - * drawn from a monotonically increasing set. 
- * @tparam value_idx - * @param[in] colors array of components - * @param[in] n_rows size of components array - * @param[in] stream cuda stream for which to order cuda operations - * @return total number of components + * DISCLAIMER: this file is deprecated: use connect_components.cuh instead */ -template -value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) -{ - return detail::get_n_components(colors, n_rows, stream); -} -/** - * Connects the components of an otherwise unconnected knn graph - * by computing a 1-nn to neighboring components of each data point - * (e.g. component(nn) != component(self)) and reducing the results to - * include the set of smallest destination components for each source - * component. The result will not necessarily contain - * n_components^2 - n_components number of elements because many components - * will likely not be contained in the neighborhoods of 1-nns. - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[out] out output edge list containing nearest cross-component - * edges. - * @param[in] X original (row-major) dense matrix for which knn graph should be constructed. - * @param[in] orig_colors array containing component number for each row of X - * @param[in] n_rows number of rows in X - * @param[in] n_cols number of cols in X - * @param[in] reduction_op - * @param[in] metric - */ -template -void connect_components( - const raft::handle_t& handle, - raft::sparse::COO& out, - const value_t* X, - const value_idx* orig_colors, - size_t n_rows, - size_t n_cols, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) -{ - detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric); -} +#pragma once -}; // end namespace linkage -}; // end namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") -#endif \ No newline at end of file +#include "connect_components.cuh" diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index fd9ab4ac3d..f6895addd1 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -13,90 +13,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __SPARSE_KNN_H -#define __SPARSE_KNN_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace selection { +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
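As with `mst`, the `connect_components` move keeps the old `raft::linkage` spellings alive through `using`-declarations. A short sketch of equivalent old and new call sites (the new header path is an assumption, since the diff elides the include targets):

```cpp
#include <cstddef>
#include <raft/sparse/spatial/connect_components.cuh>  // assumed new header path

// Count distinct components from a device array of labels; both spellings
// resolve to the same function after this change.
int count_components(int* colors, std::size_t n_rows, cudaStream_t stream)
{
  int n_new = raft::sparse::spatial::get_n_components(colors, n_rows, stream);
  int n_old = raft::linkage::get_n_components(colors, n_rows, stream);  // deprecated spelling
  return n_new == n_old ? n_new : -1;
}
```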
+ */ /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNZ number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] n_idx_cols - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) + * DISCLAIMER: this file is deprecated: use knn.cuh instead */ -template -void brute_force_knn(const value_idx* idxIndptr, - const value_idx* idxIndices, - const value_t* idxData, - size_t idxNNZ, - int n_idx_rows, - int n_idx_cols, - const value_idx* queryIndptr, - const value_idx* queryIndices, - const value_t* queryData, - size_t queryNNZ, - int n_query_rows, - int n_query_cols, - value_idx* output_indices, - value_t* output_dists, - int k, - const raft::handle_t& handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) -{ - detail::sparse_knn_t(idxIndptr, - idxIndices, - idxData, - idxNNZ, - n_idx_rows, - n_idx_cols, - queryIndptr, - queryIndices, - queryData, - queryNNZ, - n_query_rows, - n_query_cols, - output_indices, - output_dists, - k, - handle, - batch_size_index, - batch_size_query, - metric, - metricArg) - .run(); -} -}; // namespace selection -}; // namespace sparse -}; // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") + +#include -#endif \ No newline at end of file +namespace raft::sparse::selection { +using raft::sparse::spatial::brute_force_knn; +} \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index bd6dd39fdf..cd5e7b1fa3 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -18,90 +18,14 @@ * Please use the cuh version instead. 
*/ -#ifndef __SPARSE_KNN_H -#define __SPARSE_KNN_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace sparse { -namespace selection { - /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNZ number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] n_idx_cols - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) + * DISCLAIMER: this file is deprecated: use knn.cuh instead */ -template -void brute_force_knn(const value_idx* idxIndptr, - const value_idx* idxIndices, - const value_t* idxData, - size_t idxNNZ, - int n_idx_rows, - int n_idx_cols, - const value_idx* queryIndptr, - const value_idx* queryIndices, - const value_t* queryData, - size_t queryNNZ, - int n_query_rows, - int n_query_cols, - value_idx* output_indices, - value_t* output_dists, - int k, - const raft::handle_t& handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) -{ - detail::sparse_knn_t(idxIndptr, - idxIndices, - idxData, - idxNNZ, - n_idx_rows, - n_idx_cols, - queryIndptr, - queryIndices, - queryData, - queryNNZ, - n_query_rows, - n_query_cols, - output_indices, - output_dists, - k, - handle, - batch_size_index, - batch_size_query, - metric, - metricArg) - .run(); -} -}; // namespace selection -}; // namespace sparse -}; // namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") -#endif \ No newline at end of file +#include "knn.cuh" diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 7d342db43b..54cc52f4ae 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -13,51 +13,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __KNN_GRAPH_H -#define __KNN_GRAPH_H +/** + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
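Putting the long parameter list above into context, a sparse brute-force k-NN call has this shape after the move to `raft::sparse::spatial` (the namespace comes from the `using`-declaration in the hunk; the header path is an assumption, all pointers are device buffers with the documented sizes, and the batching/metric arguments keep their defaults):

```cpp
#include <cstddef>
#include <raft/sparse/spatial/knn.cuh>  // assumed new header path

void sparse_knn(const raft::handle_t& handle,
                const int* idx_indptr, const int* idx_indices, const float* idx_data,
                std::size_t idx_nnz, int n_idx_rows, int n_idx_cols,
                const int* query_indptr, const int* query_indices, const float* query_data,
                std::size_t query_nnz, int n_query_rows, int n_query_cols,
                int* out_indices,   // size n_query_rows * k
                float* out_dists,   // size n_query_rows * k
                int k)
{
  raft::sparse::spatial::brute_force_knn(idx_indptr, idx_indices, idx_data, idx_nnz,
                                         n_idx_rows, n_idx_cols,
                                         query_indptr, query_indices, query_data, query_nnz,
                                         n_query_rows, n_query_cols,
                                         out_indices, out_dists, k, handle);
}
```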
+ */ -#pragma once +/** + * DISCLAIMER: this file is deprecated: use knn_graph.cuh instead + */ -#include -#include -#include +#pragma once -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") -namespace raft { -namespace sparse { -namespace selection { +#include -/** - * Constructs a (symmetrized) knn graph edge list from - * dense input vectors. - * - * Note: The resulting KNN graph is not guaranteed to be connected. - * - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[in] X dense matrix of input data samples and observations - * @param[in] m number of data samples (rows) in X - * @param[in] n number of observations (columns) in X - * @param[in] metric distance metric to use when constructing neighborhoods - * @param[out] out output edge list - * @param c - */ -template -void knn_graph(const handle_t& handle, - const value_t* X, - std::size_t m, - std::size_t n, - raft::distance::DistanceType metric, - raft::sparse::COO& out, - int c = 15) -{ - detail::knn_graph(handle, X, m, n, metric, out, c); +namespace raft::sparse::selection { +using raft::sparse::spatial::knn_graph; } - -}; // namespace selection -}; // namespace sparse -}; // end namespace raft - -#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index be47a6a9ef..e8236b1732 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -18,51 +18,14 @@ * Please use the cuh version instead. */ -#ifndef __KNN_GRAPH_H -#define __KNN_GRAPH_H - -#pragma once - -#include -#include -#include - -#include - -namespace raft { -namespace sparse { -namespace selection { - /** - * Constructs a (symmetrized) knn graph edge list from - * dense input vectors. - * - * Note: The resulting KNN graph is not guaranteed to be connected. - * - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[in] X dense matrix of input data samples and observations - * @param[in] m number of data samples (rows) in X - * @param[in] n number of observations (columns) in X - * @param[in] metric distance metric to use when constructing neighborhoods - * @param[out] out output edge list - * @param c + * DISCLAIMER: this file is deprecated: use knn_graph.cuh instead */ -template -void knn_graph(const handle_t& handle, - const value_t* X, - std::size_t m, - std::size_t n, - raft::distance::DistanceType metric, - raft::sparse::COO& out, - int c = 15) -{ - detail::knn_graph(handle, X, m, n, metric, out, c); -} -}; // namespace selection -}; // namespace sparse -}; // end namespace raft +#pragma once + +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the sparse/spatial version instead.") -#endif \ No newline at end of file +#include "knn_graph.cuh" diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh new file mode 100644 index 0000000000..49f4e01362 --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh @@ -0,0 +1,1396 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// for cmath: +#define _USE_MATH_DEFINES + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft::sparse::solver::detail { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. 
+ */ +template +int performLanczosIteration(handle_t const& handle, + spectral::matrix::sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; + value_type_t alpha; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); + + index_type_t n = A->nrows_; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + + alpha = -alpha_host[0]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + alpha = 1 / beta_host[0]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal( + cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, + stream)); + + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + 
&one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Orthogonalization with 3-term recurrence relation + else { + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); + + auto alpha = -alpha_host[*iter - 1]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + alpha = -beta_host[*iter - 2]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Compute residual + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + + // Normalize Lanczos vector + alpha = 1 / beta_host[*iter - 1]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal( + cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + handle.sync_stream(stream); + + return 0; +} + +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + index_type_t i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; +} + +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
+ */ +template +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ + // Loop indices + index_type_t i, j; + // Dot product between Householder vector and matrix row/column + value_type_t vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + value_type_t bulge[16]; + + // Householder vector + value_type_t householder[3]; + // Householder matrix + value_type_t householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; + + // Loop indices + index_type_t i, j, pos; + // Temporary variable + value_type_t temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(value_type_t)); + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + 
for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer (host memory) to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer (host memory) to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. 
+ * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. + */ +template +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Loop index + index_type_t i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + index_type_t restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + value_type_t* ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + value_type_t* shifts_host; + + // Orthonormal matrix for similarity transform + value_type_t* V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout < *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + RAFT_CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + 
RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); + + // Obtain new Lanczos vectors + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + // Normalize residual to obtain new Lanczos vector + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal( + cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + + return 0; +} + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. 
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Shift for implicit restart + value_type_t shiftUpper; + value_type_t shiftLower; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue to determine shift + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + + curandSetPseudoRandomGeneratorSeed(randGen, seed); + + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter - 
1]; + + // ------------------------------------------------------- + // Compute eigenvectors of shifted matrix + // ------------------------------------------------------- + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +template +int computeSmallestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + // Matrix dimension + index_type_t n = A.nrows_; + 
+ // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + spectral::matrix::vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + spectral::matrix::vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + value_type_t shift; + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. 
+ */ +template +int computeLargestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + value_type_t shift_val = 0.0; + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; + while 
(*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues + // but effItter can be larger than nEigVecs + // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the + // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest + // eigenvalues case because the first ones are the smallest ones + + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; + + // Debug : print nEigVecs largest eigenvalues + // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + // std::cout < +int computeLargestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + spectral::matrix::vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + spectral::matrix::vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/solver/detail/mst_kernels.cuh similarity index 98% rename from cpp/include/raft/sparse/mst/detail/mst_kernels.cuh rename to cpp/include/raft/sparse/solver/detail/mst_kernels.cuh index 36d426029b..916690be67 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/solver/detail/mst_kernels.cuh @@ -1,6 +1,6 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,15 +17,13 @@ #pragma once -#include "utils.cuh" +#include #include -#include +#include -namespace raft { -namespace mst { -namespace detail { +namespace raft::sparse::solver::detail { template __global__ void kernel_min_edge_per_vertex(const edge_t* offsets, @@ -332,6 +330,4 @@ __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } } -} // namespace detail -} // namespace mst -} // namespace raft +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh similarity index 98% rename from cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh rename to cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh index fa8ecf2563..be8b696bca 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh @@ -18,10 +18,10 @@ #include -#include "mst_kernels.cuh" -#include "utils.cuh" +#include +#include -#include +#include #include #include @@ -43,8 +43,7 @@ #include -namespace raft { -namespace mst { +namespace raft::sparse::solver { // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, @@ -115,8 +114,7 @@ MST_solver::MST_solver(const raft::han } template -raft::Graph_COO -MST_solver::solve() +Graph_COO MST_solver::solve() { RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); @@ -409,6 +407,4 @@ void MST_solver::append_src_dst_pair( src_dst_zip_end, new_edges_functor()); } - -} // namespace mst -} // namespace raft +} // namespace raft::sparse::solver diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/solver/detail/mst_utils.cuh similarity index 87% rename from cpp/include/raft/sparse/mst/detail/utils.cuh rename to cpp/include/raft/sparse/solver/detail/mst_utils.cuh index 94ddf4ed94..a33141192b 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/solver/detail/mst_utils.cuh @@ -20,9 +20,7 @@ #include #include -namespace raft { -namespace mst { -namespace detail { +namespace raft::sparse::solver::detail { template __device__ idx_t get_1D_idx() @@ -30,6 +28,4 @@ __device__ idx_t get_1D_idx() return blockIdx.x * blockDim.x + threadIdx.x; } -} // namespace detail -} // namespace mst -} // namespace raft +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/lanczos.cuh b/cpp/include/raft/sparse/solver/lanczos.cuh new file mode 100644 index 0000000000..9b5301988a --- /dev/null +++ b/cpp/include/raft/sparse/solver/lanczos.cuh @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __LANCZOS_H +#define __LANCZOS_H + +#pragma once + +#include +#include + +namespace raft::sparse::solver { + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + return detail::computeSmallestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); +} + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. 
Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeLargestEigenvectors( + handle_t const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + return detail::computeLargestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); +} + +} // namespace raft::sparse::solver + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/solver/mst.cuh b/cpp/include/raft/sparse/solver/mst.cuh new file mode 100644 index 0000000000..33beeb1915 --- /dev/null +++ b/cpp/include/raft/sparse/solver/mst.cuh @@ -0,0 +1,50 @@ + +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace raft::sparse::solver { + +template +Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); + return mst_solver.solve(); +} + +} // end namespace raft::sparse::solver diff --git a/cpp/include/raft/sparse/solver/mst_solver.cuh b/cpp/include/raft/sparse/solver/mst_solver.cuh new file mode 100644 index 0000000000..a10b74d77b --- /dev/null +++ b/cpp/include/raft/sparse/solver/mst_solver.cuh @@ -0,0 +1,102 @@ + +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::sparse::solver { + +template +struct Graph_COO { + rmm::device_uvector src; + rmm::device_uvector dst; + rmm::device_uvector weights; + edge_t n_edges; + + Graph_COO(vertex_t size, cudaStream_t stream) + : src(size, stream), dst(size, stream), weights(size, stream) + { + } +}; + +template +class MST_solver { + public: + MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_); + + Graph_COO solve(); + + ~MST_solver() {} + + private: + const raft::handle_t& handle; + cudaStream_t stream; + bool symmetrize_output, initialize_colors; + int iterations; + + // CSR + const edge_t* offsets; + const vertex_t* indices; + const weight_t* weights; + const vertex_t v; + const edge_t e; + + vertex_t max_blocks; + vertex_t max_threads; + vertex_t sm_count; + + vertex_t* color_index; // represent each supervertex as a color + rmm::device_uvector min_edge_color; // minimum incident edge weight per color + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration + rmm::device_scalar + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to + + // new src-dst pairs found per iteration + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; + + void label_prop(vertex_t* mst_src, vertex_t* mst_dst); + void min_edge_per_vertex(); + void min_edge_per_supervertex(); + void check_termination(); + void alteration(); + alteration_t alteration_max(); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); +}; + +} // namespace raft::sparse::solver + +#include diff --git a/cpp/include/raft/sparse/spatial/connect_components.cuh b/cpp/include/raft/sparse/spatial/connect_components.cuh new file mode 100644 index 0000000000..60c0bba1de --- /dev/null +++ b/cpp/include/raft/sparse/spatial/connect_components.cuh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::sparse::spatial { + +template +using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp; + +/** + * Gets the number of unique components from array of + * colors or labels. This does not assume the components are + * drawn from a monotonically increasing set. + * @tparam value_idx + * @param[in] colors array of components + * @param[in] n_rows size of components array + * @param[in] stream cuda stream for which to order cuda operations + * @return total number of components + */ +template +value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) +{ + return detail::get_n_components(colors, n_rows, stream); +} + +/** + * Connects the components of an otherwise unconnected knn graph + * by computing a 1-nn to neighboring components of each data point + * (e.g. component(nn) != component(self)) and reducing the results to + * include the set of smallest destination components for each source + * component. The result will not necessarily contain + * n_components^2 - n_components number of elements because many components + * will likely not be contained in the neighborhoods of 1-nns. + * @tparam value_idx + * @tparam value_t + * @param[in] handle raft handle + * @param[out] out output edge list containing nearest cross-component + * edges. + * @param[in] X original (row-major) dense matrix for which knn graph should be constructed. + * @param[in] orig_colors array containing component number for each row of X + * @param[in] n_rows number of rows in X + * @param[in] n_cols number of cols in X + * @param[in] reduction_op + * @param[in] metric + */ +template +void connect_components( + const raft::handle_t& handle, + raft::sparse::COO& out, + const value_t* X, + const value_idx* orig_colors, + size_t n_rows, + size_t n_cols, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ + detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric); +} + +}; // end namespace raft::sparse::spatial \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/spatial/detail/connect_components.cuh similarity index 98% rename from cpp/include/raft/sparse/selection/detail/connect_components.cuh rename to cpp/include/raft/sparse/spatial/detail/connect_components.cuh index 92d06197cd..f515ab5739 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/spatial/detail/connect_components.cuh @@ -13,9 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once #include +#include #include #include #include @@ -24,7 +26,7 @@ #include #include -#include +#include #include #include @@ -42,10 +44,7 @@ #include -namespace raft { -namespace linkage { -namespace detail { - +namespace raft::sparse::spatial::detail { /** * \brief A key identifier paired with a corresponding value * @@ -438,6 +437,4 @@ void connect_components( handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); } -}; // end namespace detail -}; // end namespace linkage -}; // end namespace raft +}; // end namespace raft::sparse::spatial::detail diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/spatial/detail/knn.cuh similarity index 98% rename from cpp/include/raft/sparse/selection/detail/knn.cuh rename to cpp/include/raft/sparse/spatial/detail/knn.cuh index b1dd6116e7..aa933cd680 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/spatial/detail/knn.cuh @@ -18,11 +18,11 @@ #include -#include -#include -#include +#include #include #include +#include +#include #include #include @@ -33,10 +33,7 @@ #include -namespace raft { -namespace sparse { -namespace selection { -namespace detail { +namespace raft::sparse::spatial::detail { template struct csr_batcher_t { @@ -428,7 +425,4 @@ class sparse_knn_t { const raft::handle_t& handle; }; -}; // namespace detail -}; // namespace selection -}; // namespace sparse -}; // namespace raft +}; // namespace raft::sparse::spatial::detail \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/spatial/detail/knn_graph.cuh similarity index 94% rename from cpp/include/raft/sparse/selection/detail/knn_graph.cuh rename to cpp/include/raft/sparse/spatial/detail/knn_graph.cuh index 32b7fd3c63..1331393719 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/spatial/detail/knn_graph.cuh @@ -16,15 +16,15 @@ #pragma once -#include -#include +#include +#include #include #include #include -#include +#include #include #include @@ -35,10 +35,7 @@ #include #include -namespace raft { -namespace sparse { -namespace selection { -namespace detail { +namespace raft::sparse::spatial::detail { /** * Fills indices array of pairwise distance array @@ -150,7 +147,4 @@ void knn_graph(const handle_t& handle, handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); } -}; // namespace detail -}; // namespace selection -}; // namespace sparse -}; // end namespace raft +}; // namespace raft::sparse::spatial::detail diff --git a/cpp/include/raft/sparse/spatial/knn.cuh b/cpp/include/raft/sparse/spatial/knn.cuh new file mode 100644 index 0000000000..1e8a08ec96 --- /dev/null +++ b/cpp/include/raft/sparse/spatial/knn.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +namespace raft::sparse::spatial { + +/** + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNZ number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] n_idx_cols + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] handle CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ +template +void brute_force_knn(const value_idx* idxIndptr, + const value_idx* idxIndices, + const value_t* idxData, + size_t idxNNZ, + int n_idx_rows, + int n_idx_cols, + const value_idx* queryIndptr, + const value_idx* queryIndices, + const value_t* queryData, + size_t queryNNZ, + int n_query_rows, + int n_query_cols, + value_idx* output_indices, + value_t* output_dists, + int k, + const raft::handle_t& handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + detail::sparse_knn_t(idxIndptr, + idxIndices, + idxData, + idxNNZ, + n_idx_rows, + n_idx_cols, + queryIndptr, + queryIndices, + queryData, + queryNNZ, + n_query_rows, + n_query_cols, + output_indices, + output_dists, + k, + handle, + batch_size_index, + batch_size_query, + metric, + metricArg) + .run(); +} + +}; // namespace raft::sparse::spatial diff --git a/cpp/include/raft/sparse/spatial/knn_graph.cuh b/cpp/include/raft/sparse/spatial/knn_graph.cuh new file mode 100644 index 0000000000..9694e6a293 --- /dev/null +++ b/cpp/include/raft/sparse/spatial/knn_graph.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft::sparse::spatial { + +/** + * Constructs a (symmetrized) knn graph edge list from + * dense input vectors. + * + * Note: The resulting KNN graph is not guaranteed to be connected. + * + * @tparam value_idx + * @tparam value_t + * @param[in] handle raft handle + * @param[in] X dense matrix of input data samples and observations + * @param[in] m number of data samples (rows) in X + * @param[in] n number of observations (columns) in X + * @param[in] metric distance metric to use when constructing neighborhoods + * @param[out] out output edge list + * @param c + */ +template +void knn_graph(const handle_t& handle, + const value_t* X, + std::size_t m, + std::size_t n, + raft::distance::DistanceType metric, + raft::sparse::COO& out, + int c = 15) +{ + detail::knn_graph(handle, X, m, n, metric, out, c); +} + +}; // namespace raft::sparse::spatial diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 45867dbfee..a0d79a1b77 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -23,7 +23,7 @@ #include "detail/processing.hpp" #include "ivf_flat_types.hpp" -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann_types.hpp b/cpp/include/raft/spatial/knn/ann_types.hpp new file mode 100644 index 0000000000..6e9a00bc0c --- /dev/null +++ b/cpp/include/raft/spatial/knn/ann_types.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn { + +/** The base for approximate KNN index structures. */ +struct index { +}; + +/** The base for KNN index parameters. */ +struct index_params { + /** Distance type. */ + raft::distance::DistanceType metric = distance::DistanceType::L2Expanded; + /** The argument used by some distance metrics. */ + float metric_arg = 2.0f; + /** + * Whether to add the dataset content to the index, i.e.: + * + * - `true` means the index is filled with the dataset vectors and ready to search after calling + * `build`. + * - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but + * the index is left empty; you'd need to call `extend` on the index afterwards to populate it. 
+ */ + bool add_data_on_build = true; +}; + +struct search_params { +}; + +}; // namespace raft::spatial::knn diff --git a/cpp/include/raft/spatial/knn/ball_cover.cuh b/cpp/include/raft/spatial/knn/ball_cover.cuh index 62cd5aa45c..9cb9b573b1 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/ball_cover.cuh @@ -20,26 +20,35 @@ #include -#include "ball_cover_common.h" +#include "ball_cover_types.hpp" #include "detail/ball_cover.cuh" #include "detail/ball_cover/common.cuh" -#include +#include #include namespace raft { namespace spatial { namespace knn { -template +/** + * Builds and populates a previously unbuilt BallCoverIndex + * @tparam idx_t knn index type + * @tparam value_t knn value type + * @tparam int_t integral type for knn params + * @tparam matrix_idx_t matrix indexing type + * @param[in] handle library resource management handle + * @param[inout] index an empty (and not previous built) instance of BallCoverIndex + */ +template void rbc_build_index(const raft::handle_t& handle, - BallCoverIndex& index) + BallCoverIndex& index) { ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); if (index.metric == raft::distance::DistanceType::Haversine) { - detail::rbc_build_index(handle, index, detail::HaversineFunc()); + detail::rbc_build_index(handle, index, detail::HaversineFunc()); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { - detail::rbc_build_index(handle, index, detail::EuclideanFunc()); + detail::rbc_build_index(handle, index, detail::EuclideanFunc()); } else { RAFT_FAIL("Metric not support"); } @@ -55,18 +64,18 @@ void rbc_build_index(const raft::handle_t& handle, * the index and query are the same array. This function will * build the index and assumes rbc_build_index() has not already * been called. - * @tparam value_idx knn index type + * @tparam idx_t knn index type * @tparam value_t knn distance type - * @tparam value_int type for integers, such as number of rows/cols - * @param handle raft handle for resource management - * @param index ball cover index which has not yet been built - * @param k number of nearest neighbors to find - * @param perform_post_filtering if this is false, only the closest k landmarks + * @tparam int_t type for integers, such as number of rows/cols + * @param[in] handle raft handle for resource management + * @param[inout] index ball cover index which has not yet been built + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks * are considered (which will return approximate * results). * @param[out] inds output knn indices * @param[out] dists output knn distances - * @param weight a weight for overlap between the closest landmark and + * @param[in] weight a weight for overlap between the closest landmark and * the radius of other landmarks when pruning distances. * Setting this value below 1 can effectively turn off * computing distances against many other balls, enabling @@ -75,11 +84,11 @@ void rbc_build_index(const raft::handle_t& handle, * many datasets can still have great recall even by only * looking in the closest landmark. 
*/ -template +template void rbc_all_knn_query(const raft::handle_t& handle, - BallCoverIndex& index, - value_int k, - value_idx* inds, + BallCoverIndex& index, + int_t k, + idx_t* inds, value_t* dists, bool perform_post_filtering = true, float weight = 1.0) @@ -91,7 +100,7 @@ void rbc_all_knn_query(const raft::handle_t& handle, k, inds, dists, - detail::HaversineFunc(), + detail::HaversineFunc(), perform_post_filtering, weight); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || @@ -101,7 +110,7 @@ void rbc_all_knn_query(const raft::handle_t& handle, k, inds, dists, - detail::EuclideanFunc(), + detail::EuclideanFunc(), perform_post_filtering, weight); } else { @@ -111,6 +120,58 @@ void rbc_all_knn_query(const raft::handle_t& handle, index.set_index_trained(); } +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes rbc_build_index() has not already + * been called. + * @tparam idx_t knn index type + * @tparam value_t knn distance type + * @tparam int_t type for integers, such as number of rows/cols + * @tparam matrix_idx_t matrix indexing type + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has not yet been built + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + */ +template +void rbc_all_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int_t k, + bool perform_post_filtering = true, + float weight = 1.0) +{ + RAFT_EXPECTS(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + RAFT_EXPECTS(k <= index.m, + "k must be less than or equal to the number of data points in the index"); + RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast(k), + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == index.get_X().extent(0), + "Number of rows in output indices and distances matrices must equal number of rows " + "in index matrix."); + + rbc_all_knn_query( + handle, index, k, inds.data_handle(), dists.data_handle(), perform_post_filtering, weight); +} + /** * Performs a faster exact knn in metric spaces using the triangle * inequality with a number of landmark points to reduce the @@ -118,19 +179,19 @@ void rbc_all_knn_query(const raft::handle_t& handle, * function does not build the index and assumes rbc_build_index() has * already been called. 
Use this function when the index and
 * query arrays are different, otherwise use rbc_all_knn_query().
- * @tparam value_idx index type
+ * @tparam idx_t index type
  * @tparam value_t distances type
- * @tparam value_int integer type for size info
- * @param handle raft handle for resource management
- * @param index ball cover index which has not yet been built
- * @param k number of nearest neighbors to find
- * @param query the
- * @param perform_post_filtering if this is false, only the closest k landmarks
+ * @tparam int_t integer type for size info
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] query the query data points
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
  *                               are considered (which will return approximate
  *                               results).
  * @param[out] inds output knn indices
  * @param[out] dists output knn distances
- * @param weight a weight for overlap between the closest landmark and
+ * @param[in] weight a weight for overlap between the closest landmark and
  *               the radius of other landmarks when pruning distances.
  *               Setting this value below 1 can effectively turn off
  *               computing distances against many other balls, enabling
@@ -140,13 +201,13 @@ void rbc_all_knn_query(const raft::handle_t& handle,
  *               looking in the closest landmark.
  * @param[in] n_query_pts number of query points
  */
-template
+template
 void rbc_knn_query(const raft::handle_t& handle,
-                   BallCoverIndex& index,
-                   value_int k,
+                   const BallCoverIndex& index,
+                   int_t k,
                    const value_t* query,
-                   value_int n_query_pts,
-                   value_idx* inds,
+                   int_t n_query_pts,
+                   idx_t* inds,
                    value_t* dists,
                    bool perform_post_filtering = true,
                    float weight = 1.0)
@@ -160,7 +221,7 @@ void rbc_knn_query(const raft::handle_t& handle,
                           n_query_pts,
                           inds,
                           dists,
-                          detail::HaversineFunc(),
+                          detail::HaversineFunc(),
                           perform_post_filtering,
                           weight);
   } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
@@ -172,7 +233,7 @@ void rbc_knn_query(const raft::handle_t& handle,
                           n_query_pts,
                           inds,
                           dists,
-                          detail::EuclideanFunc(),
+                          detail::EuclideanFunc(),
                           perform_post_filtering,
                           weight);
   } else {
@@ -180,6 +241,68 @@ void rbc_knn_query(const raft::handle_t& handle,
   }
 }
 
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes rbc_build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use rbc_all_knn_query().
+ * @tparam idx_t index type
+ * @tparam value_t distances type
+ * @tparam int_t integer type for size info
+ * @tparam matrix_idx_t matrix indexing type
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] query device matrix containing query data points
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + */ +template +void rbc_knn_query(const raft::handle_t& handle, + const BallCoverIndex& index, + raft::device_matrix_view query, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int_t k, + bool perform_post_filtering = true, + float weight = 1.0) +{ + RAFT_EXPECTS(k <= index.m, + "k must be less than or equal to the number of data points in the index"); + RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast(k), + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == query.extent(0), + "Number of rows in output indices and distances matrices must equal number of rows " + "in search matrix."); + + RAFT_EXPECTS(query.extent(1) == index.get_X().extent(1), + "Number of columns in query and index matrices must match."); + + rbc_knn_query(handle, + index, + k, + query.data_handle(), + query.extent(0), + inds.data_handle(), + dists.data_handle(), + perform_post_filtering, + weight); +} + // TODO: implement functions for: // 4. rbc_eps_neigh() - given a populated index, perform query against different query array // 5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data @@ -188,4 +311,4 @@ void rbc_knn_query(const raft::handle_t& handle, } // namespace spatial } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index a2234abf26..9b775bbb82 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,90 +16,8 @@ #pragma once -#include -#include -#include -#include +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the ball_cover_types.hpp version instead.") -namespace raft { -namespace spatial { -namespace knn { - -/** - * Stores raw index data points, sampled landmarks, the 1-nns of index points - * to their closest landmarks, and the ball radii of each landmark. This - * class is intended to be constructed once and reused across subsequent - * queries. 
- * @tparam value_idx - * @tparam value_t - * @tparam value_int - */ -template -class BallCoverIndex { - public: - explicit BallCoverIndex(const raft::handle_t& handle_, - const value_t* X_, - value_int m_, - value_int n_, - raft::distance::DistanceType metric_) - : handle(handle_), - X(X_), - m(m_), - n(n_), - metric(metric_), - /** - * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound - * - * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) - */ - n_landmarks(sqrt(m_)), - R_indptr(sqrt(m_) + 1, handle.get_stream()), - R_1nn_cols(m_, handle.get_stream()), - R_1nn_dists(m_, handle.get_stream()), - R_closest_landmark_dists(m_, handle.get_stream()), - R(sqrt(m_) * n_, handle.get_stream()), - R_radius(sqrt(m_), handle.get_stream()), - index_trained(false) - { - } - - value_idx* get_R_indptr() { return R_indptr.data(); } - value_idx* get_R_1nn_cols() { return R_1nn_cols.data(); } - value_t* get_R_1nn_dists() { return R_1nn_dists.data(); } - value_t* get_R_radius() { return R_radius.data(); } - value_t* get_R() { return R.data(); } - value_t* get_R_closest_landmark_dists() { return R_closest_landmark_dists.data(); } - const value_t* get_X() { return X; } - - bool is_index_trained() const { return index_trained; }; - - // This should only be set by internal functions - void set_index_trained() { index_trained = true; } - - const raft::handle_t& handle; - - const value_int m; - const value_int n; - const value_int n_landmarks; - - const value_t* X; - - raft::distance::DistanceType metric; - - private: - // CSR storing the neighborhoods for each data point - rmm::device_uvector R_indptr; - rmm::device_uvector R_1nn_cols; - rmm::device_uvector R_1nn_dists; - rmm::device_uvector R_closest_landmark_dists; - - rmm::device_uvector R_radius; - - rmm::device_uvector R; - - protected: - bool index_trained; -}; -} // namespace knn -} // namespace spatial -} // namespace raft +#include diff --git a/cpp/include/raft/spatial/knn/ball_cover_types.hpp b/cpp/include/raft/spatial/knn/ball_cover_types.hpp new file mode 100644 index 0000000000..897bb4df5b --- /dev/null +++ b/cpp/include/raft/spatial/knn/ball_cover_types.hpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { + +/** + * Stores raw index data points, sampled landmarks, the 1-nns of index points + * to their closest landmarks, and the ball radii of each landmark. This + * class is intended to be constructed once and reused across subsequent + * queries. 
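+ *
+ * A construction sketch (types and shapes assumed for illustration; see the
+ * constructors below):
+ * @code{.cpp}
+ *   // X is a device pointer to row-major data of shape [m, n]
+ *   BallCoverIndex<std::int64_t, float> index(
+ *     handle, X, m, n, raft::distance::DistanceType::L2SqrtExpanded);
+ *   rbc_build_index(handle, index);  // declared in ball_cover.cuh
+ * @endcode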
+ * @tparam value_idx + * @tparam value_t + * @tparam value_int + */ +template +class BallCoverIndex { + public: + explicit BallCoverIndex(const raft::handle_t& handle_, + const value_t* X_, + value_int m_, + value_int n_, + raft::distance::DistanceType metric_) + : handle(handle_), + X(raft::make_device_matrix_view(X_, m_, n_)), + m(m_), + n(n_), + metric(metric_), + /** + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ + n_landmarks(sqrt(m_)), + R_indptr(raft::make_device_vector(handle, sqrt(m_) + 1)), + R_1nn_cols(raft::make_device_vector(handle, m_)), + R_1nn_dists(raft::make_device_vector(handle, m_)), + R_closest_landmark_dists(raft::make_device_vector(handle, m_)), + R(raft::make_device_matrix(handle, sqrt(m_), n_)), + R_radius(raft::make_device_vector(handle, sqrt(m_))), + index_trained(false) + { + } + + explicit BallCoverIndex(const raft::handle_t& handle_, + raft::device_matrix_view X_, + raft::distance::DistanceType metric_) + : handle(handle_), + X(X_), + m(X_.extent(0)), + n(X_.extent(1)), + metric(metric_), + /** + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ + n_landmarks(sqrt(X_.extent(0))), + R_indptr(raft::make_device_vector(handle, sqrt(X_.extent(0)) + 1)), + R_1nn_cols(raft::make_device_vector(handle, X_.extent(0))), + R_1nn_dists(raft::make_device_vector(handle, X_.extent(0))), + R_closest_landmark_dists(raft::make_device_vector(handle, X_.extent(0))), + R(raft::make_device_matrix(handle, sqrt(X_.extent(0)), X_.extent(1))), + R_radius(raft::make_device_vector(handle, sqrt(X_.extent(0)))), + index_trained(false) + { + } + + auto get_R_indptr() const -> raft::device_vector_view + { + return R_indptr.view(); + } + auto get_R_1nn_cols() const -> raft::device_vector_view + { + return R_1nn_cols.view(); + } + auto get_R_1nn_dists() const -> raft::device_vector_view + { + return R_1nn_dists.view(); + } + auto get_R_radius() const -> raft::device_vector_view + { + return R_radius.view(); + } + auto get_R() const -> raft::device_matrix_view + { + return R.view(); + } + auto get_R_closest_landmark_dists() const -> raft::device_vector_view + { + return R_closest_landmark_dists.view(); + } + + raft::device_vector_view get_R_indptr() { return R_indptr.view(); } + raft::device_vector_view get_R_1nn_cols() { return R_1nn_cols.view(); } + raft::device_vector_view get_R_1nn_dists() { return R_1nn_dists.view(); } + raft::device_vector_view get_R_radius() { return R_radius.view(); } + raft::device_matrix_view get_R() { return R.view(); } + raft::device_vector_view get_R_closest_landmark_dists() + { + return R_closest_landmark_dists.view(); + } + raft::device_matrix_view get_X() const { return X; } + + raft::distance::DistanceType get_metric() const { return metric; } + + value_int get_n_landmarks() const { return n_landmarks; } + bool is_index_trained() const { return index_trained; }; + + // This should only be set by internal functions + void set_index_trained() { index_trained = true; } + + const raft::handle_t& handle; + + value_int m; + value_int n; + value_int n_landmarks; + + raft::device_matrix_view X; + + raft::distance::DistanceType metric; + + private: + // CSR storing the neighborhoods for each data point + raft::device_vector R_indptr; + raft::device_vector R_1nn_cols; + raft::device_vector R_1nn_dists; + raft::device_vector R_closest_landmark_dists; + + 
raft::device_vector R_radius;
+
+  raft::device_matrix R;
+
+ protected:
+  bool index_trained;
+};
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/brute_force.cuh b/cpp/include/raft/spatial/knn/brute_force.cuh
new file mode 100644
index 0000000000..dda1e02eed
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/brute_force.cuh
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/knn_brute_force_faiss.cuh"
+#include "detail/selection_faiss.cuh"
+#include
+
+namespace raft::spatial::knn {
+
+/**
+ * @brief Performs a k-select across row partitioned index/distance
+ * matrices formatted like the following:
+ *   row1: k0, k1, k2
+ *   row2: k0, k1, k2
+ *   row3: k0, k1, k2
+ *   row1: k0, k1, k2
+ *   row2: k0, k1, k2
+ *   row3: k0, k1, k2
+ *
+ * etc...
+ *
+ * @tparam idx_t
+ * @tparam value_t
+ * @param[in] handle
+ * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k)
+ * @param[in] in_values matrix of input values (size n_samples * n_parts * k)
+ * @param[out] out_keys matrix of output keys (size n_samples * k)
+ * @param[out] out_values matrix of output values (size n_samples * k)
+ * @param[in] n_samples number of rows in each part
+ * @param[in] translations optional vector of starting index mappings for each partition
+ */
+template
+inline void knn_merge_parts(
+  const raft::handle_t& handle,
+  raft::device_matrix_view in_keys,
+  raft::device_matrix_view in_values,
+  raft::device_matrix_view out_keys,
+  raft::device_matrix_view out_values,
+  size_t n_samples,
+  std::optional> translations = std::nullopt)
+{
+  RAFT_EXPECTS(in_keys.extent(1) == in_values.extent(1) && in_keys.extent(0) == in_values.extent(0),
+               "in_keys and in_values must have the same shape.");
+  RAFT_EXPECTS(
+    out_keys.extent(0) == out_values.extent(0) &&
+      static_cast<size_t>(out_keys.extent(0)) == n_samples,
+    "Number of rows in output keys and val matrices must equal number of rows in search matrix.");
+  RAFT_EXPECTS(out_keys.extent(1) == out_values.extent(1) && out_values.extent(1) == in_keys.extent(1),
+               "Number of columns in output keys and values matrices must be equal to k");
+
+  auto n_parts = in_keys.extent(0) / n_samples;
+  detail::knn_merge_parts(in_keys.data_handle(),
+                          in_values.data_handle(),
+                          out_keys.data_handle(),
+                          out_values.data_handle(),
+                          n_samples,
+                          n_parts,
+                          in_keys.extent(1),
+                          handle.get_stream(),
+                          translations.value_or(nullptr));
+}
+
+/**
+ * @brief Flat C++ API function to perform a brute force knn on
+ * a series of input arrays and combine the results into a single
+ * output array for indexes and distances. Inputs can be either
+ * row- or column-major but the output matrices will always be in
+ * row-major format.
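+ *
+ * A usage sketch (types, layouts and extents assumed for illustration):
+ * @code{.cpp}
+ *   // db_view and search_view are device_matrix_view<const float, int> over row-major data
+ *   std::vector<raft::device_matrix_view<const float, int>> index{db_view};
+ *   brute_force_knn(handle, index, search_view, indices.view(), distances.view(), k);
+ * @endcode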
+ *
+ * @param[in] handle the raft handle to use
+ * @param[in] index vector of device matrices (each size m_i*d) to be used as the knn index
+ * @param[in] search matrix (size n*d) to be used for searching the index
+ * @param[out] indices matrix (size n*k) to store output knn indices
+ * @param[out] distances matrix (size n*k) to store the output knn distance
+ * @param[in] k the number of nearest neighbors to return
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This
+ *            is ignored if `metric` is not Minkowski.
+ * @param[in] translations starting offsets for partitions. Should be the same size
+ *            as the input vector.
+ */
+template
+void brute_force_knn(
+  raft::handle_t const& handle,
+  std::vector> index,
+  raft::device_matrix_view search,
+  raft::device_matrix_view indices,
+  raft::device_matrix_view distances,
+  value_int k,
+  distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+  std::optional metric_arg     = std::make_optional(2.0f),
+  std::optional> translations = std::nullopt)
+{
+  RAFT_EXPECTS(index[0].extent(1) == search.extent(1),
+               "Number of dimensions for both index and search matrices must be equal");
+
+  RAFT_EXPECTS(indices.extent(0) == distances.extent(0) && distances.extent(0) == search.extent(0),
+               "Number of rows in output indices and distances matrices must equal number of rows "
+               "in search matrix.");
+  RAFT_EXPECTS(
+    indices.extent(1) == distances.extent(1) && distances.extent(1) == static_cast(k),
+    "Number of columns in output indices and distances matrices must be equal to k");
+
+  bool rowMajorIndex = std::is_same_v;
+  bool rowMajorQuery = std::is_same_v;
+
+  std::vector inputs;
+  std::vector sizes;
+  for (std::size_t i = 0; i < index.size(); ++i) {
+    inputs.push_back(const_cast(index[i].data_handle()));
+    sizes.push_back(index[i].extent(0));
+  }
+
+  std::vector* trans = translations.has_value() ? &(*translations) : nullptr;
+
+  detail::brute_force_knn_impl(handle,
                               inputs,
                               sizes,
                               static_cast(index[0].extent(1)),
                               // TODO: This is unfortunate. Need to fix.
                               const_cast(search.data_handle()),
                               static_cast(search.extent(0)),
                               indices.data_handle(),
                               distances.data_handle(),
                               k,
                               rowMajorIndex,
                               rowMajorQuery,
                               trans,
                               metric,
                               metric_arg.value_or(2.0f));
+}
+
+}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/common.hpp b/cpp/include/raft/spatial/knn/common.hpp
index caaa951a66..5c444bf7a7 100644
--- a/cpp/include/raft/spatial/knn/common.hpp
+++ b/cpp/include/raft/spatial/knn/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,35 +13,11 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * This file is deprecated and will be removed in a future release.
+ * Please use the ann_types.hpp version instead.
+ */
 
 #pragma once
 
-#include
-
-namespace raft::spatial::knn {
-
-/** The base for approximate KNN index structures. */
-struct index {
-};
-
-/** The base for KNN index parameters. */
-struct index_params {
-  /** Distance type. */
-  raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
-  /** The argument used by some distance metrics.
*/ - float metric_arg = 2.0f; - /** - * Whether to add the dataset content to the index, i.e.: - * - * - `true` means the index is filled with the dataset vectors and ready to search after calling - * `build`. - * - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but - * the index is left empty; you'd need to call `extend` on the index afterwards to populate it. - */ - bool add_data_on_build = true; -}; - -struct search_params { -}; - -}; // namespace raft::spatial::knn +#include diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 74e1ae75a8..6d3289e14c 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -18,17 +18,23 @@ #include "ann_utils.cuh" +#include +#include + #include #include #include -#include -#include -#include +#include +#include +#include #include +#include #include #include +#include #include +#include #include #include #include @@ -36,73 +42,116 @@ namespace raft::spatial::knn::detail::kmeans { +constexpr static inline const float kAdjustCentersWeight = 7.0f; + /** * @brief Predict labels for the dataset; floats only. * * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows * * n_cluster * sizeof(float)). * + * @tparam IdxT index type + * @tparam LabelT label type + * * @param handle * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] * @param n_clusters number of clusters/centers * @param dim dimensionality of the data * @param[in] dataset a pointer to the data [n_rows, dim] + * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows] * @param n_rows number samples in the `dataset` * @param[out] labels output predictions [n_rows] * @param metric * @param stream * @param mr (optional) memory resource to use for temporary allocations */ -void predict_float_core(const handle_t& handle, - const float* centers, - uint32_t n_clusters, - uint32_t dim, - const float* dataset, - size_t n_rows, - uint32_t* labels, - raft::distance::DistanceType metric, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +inline void predict_float_core(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const float* dataset, + const float* dataset_norm, + IdxT n_rows, + LabelT* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - rmm::device_uvector distances(n_rows * n_clusters, stream, mr); - - float alpha; - float beta; switch (metric) { - case raft::distance::DistanceType::InnerProduct: { - alpha = -1.0; - beta = 0.0; - } break; case raft::distance::DistanceType::L2Expanded: - case raft::distance::DistanceType::L2Unexpanded: { - rmm::device_uvector sqsum_centers(n_clusters, stream, mr); - rmm::device_uvector sqsum_data(n_rows, stream, mr); - utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream); - utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream); - utils::outer_add( - sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream); - alpha = -2.0; - beta = 1.0; - } break; - // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. 
- default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); + case raft::distance::DistanceType::L2SqrtExpanded: { + auto workspace = raft::make_device_mdarray( + handle, mr, make_extents((sizeof(int)) * n_rows)); + + auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( + handle, mr, make_extents(n_rows)); + cub::KeyValuePair initial_value(0, std::numeric_limits::max()); + thrust::fill(handle.get_thrust_policy(), + minClusterAndDistance.data_handle(), + minClusterAndDistance.data_handle() + minClusterAndDistance.size(), + initial_value); + + auto centroidsNorm = + raft::make_device_mdarray(handle, mr, make_extents(n_clusters)); + raft::linalg::rowNorm( + centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NNMinReduce, IdxT>( + minClusterAndDistance.data_handle(), + dataset, + centers, + dataset_norm, + centroidsNorm.data_handle(), + n_rows, + n_clusters, + dim, + (void*)workspace.data_handle(), + (metric == raft::distance::DistanceType::L2Expanded) ? false : true, + false, + stream); + + // todo(lsugy): use KVP + iterator in caller. + // Copy keys to output labels + thrust::transform(handle.get_thrust_policy(), + minClusterAndDistance.data_handle(), + minClusterAndDistance.data_handle() + n_rows, + labels, + [=] __device__(cub::KeyValuePair kvp) { + return static_cast(kvp.key); + }); + break; + } + case raft::distance::DistanceType::InnerProduct: { + // TODO: pass buffer + rmm::device_uvector distances(n_rows * n_clusters, stream, mr); + + float alpha = -1.0; + float beta = 0.0; + + linalg::gemm(handle, + true, + false, + n_clusters, + n_rows, + dim, + &alpha, + centers, + dim, + dataset, + dim, + &beta, + distances.data(), + n_clusters, + stream); + utils::argmin_along_rows( + n_rows, static_cast(n_clusters), distances.data(), labels, stream); + break; + } + default: { + RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); + } } - linalg::gemm(handle, - true, - false, - n_clusters, - n_rows, - dim, - &alpha, - centers, - dim, - dataset, - dim, - &beta, - distances.data(), - n_clusters, - stream); - utils::argmin_along_rows(n_rows, n_clusters, distances.data(), labels, stream); } /** @@ -115,25 +164,48 @@ void predict_float_core(const handle_t& handle, * @param n_rows dataset size * @return a suggested minibatch size */ -constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t +template +constexpr inline auto calc_minibatch_size(uint32_t n_clusters, + IdxT n_rows, + uint32_t dim, + raft::distance::DistanceType metric, + bool is_float) -> IdxT { - n_clusters = std::max(1, n_clusters); - uint32_t minibatch_size = (1 << 20); - if (minibatch_size > (1 << 28) / n_clusters) { - minibatch_size = (1 << 28) / n_clusters; - minibatch_size += 32; - minibatch_size -= minibatch_size % 64; + n_clusters = std::max(1, n_clusters); + + // Estimate memory needs per row (i.e element of the batch). + IdxT mem_per_row = 0; + /* fusedL2NN only needs one integer per row for a mutex. + * Other metrics require storing a distance matrix. */ + if (metric != raft::distance::DistanceType::L2Expanded && + metric != raft::distance::DistanceType::L2SqrtExpanded) { + mem_per_row += sizeof(float) * n_clusters; + } else { + mem_per_row += sizeof(int); } - minibatch_size = uint32_t(std::min(minibatch_size, n_rows)); + // If we need to convert to float, space required for the converted batch. 
+ if (!is_float) { mem_per_row += sizeof(float) * dim; } + + // Heuristic: calculate the minibatch size in order to use at most 1GB of memory. + IdxT minibatch_size = (1 << 30) / mem_per_row; + minibatch_size = 64 * ceildiv(minibatch_size, (IdxT)64); + minibatch_size = std::min(minibatch_size, n_rows); return minibatch_size; } /** * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. * - * Let S_i = {x_k | x_k \in dataset & labels[k] == i} be the vectors in the dataset with label i. - * On exit centers_i = normalize(\sum_{x \in S_i} x), where `normalize` depends on the distance - * type. + * Let `S_i = {x_k | x_k \in dataset & labels[k] == i}` be the vectors in the dataset with label i. + * + * On exit, + * `centers_i = (\sum_{x \in S_i} x + w_i * center_i) / (|S_i| + w_i)`, + * where `w_i = reset_counters ? 0 : cluster_size[i]`. + * + * In other words, the updated cluster centers are a weighted average of the existing cluster + * center, and the coordinates of the points labeled with i. _This allows calling this function + * multiple times with different datasets with the same effect as if calling this function once + * on the combined dataset_. * * NB: `centers` and `cluster_sizes` must be accessible on GPU due to * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are @@ -144,7 +216,9 @@ constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32 * 1. All pointers are on the device. * 2. All pointers are on the host, but `centers` and `cluster_sizes` are accessible from GPU. * - * @tparam T element type + * @tparam T element type + * @tparam IdxT index type + * @tparam LabelT label type * * @param[inout] centers pointer to the output [n_clusters, dim] * @param[inout] cluster_sizes number of rows in each cluster [n_clusters] @@ -158,14 +232,14 @@ constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32 * the weighted average principle. * @param stream */ -template +template void calc_centers_and_sizes(float* centers, uint32_t* cluster_sizes, uint32_t n_clusters, uint32_t dim, const T* dataset, - size_t n_rows, - const uint32_t* labels, + IdxT n_rows, + const LabelT* labels, bool reset_counters, rmm::cuda_stream_view stream) { @@ -191,65 +265,162 @@ void calc_centers_and_sizes(float* centers, stream); } +/** Computes the L2 norm of the dataset, converting to float if necessary */ +template +void compute_norm(float* dataset_norm, + const T* dataset, + IdxT dim, + IdxT n_rows, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) +{ + common::nvtx::range fun_scope("kmeans::compute_norm"); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + rmm::device_uvector dataset_float(0, stream, mr); + + const float* dataset_ptr = nullptr; + + if (std::is_same_v) { + dataset_ptr = reinterpret_cast(dataset); + } else { + dataset_float.resize(n_rows * dim, stream); + + linalg::unaryOp(dataset_float.data(), dataset, n_rows * dim, utils::mapping{}, stream); + + dataset_ptr = (const float*)dataset_float.data(); + } + + raft::linalg::rowNorm( + dataset_norm, dataset_ptr, dim, n_rows, raft::linalg::L2Norm, true, stream); +} + /** * @brief Predict labels for the dataset. 
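+ *
+ * A usage sketch (internal API; all buffers are assumed to be allocated by the
+ * caller, on device or in managed memory):
+ * @code{.cpp}
+ *   kmeans::predict(handle, centers, n_clusters, dim, dataset, n_rows,
+ *                   labels, metric, stream);
+ * @endcode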
* - * @tparam T element type + * @tparam T element type + * @tparam IdxT index type + * @tparam LabelT label type * * @param handle * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] * @param n_clusters number of clusters/centers * @param dim dimensionality of the data * @param[in] dataset a pointer to the data [n_rows, dim] + * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows] * @param n_rows number samples in the `dataset` * @param[out] labels output predictions [n_rows] * @param metric * @param stream * @param mr (optional) memory resource to use for temporary allocations */ - -template +template void predict(const handle_t& handle, const float* centers, uint32_t n_clusters, uint32_t dim, const T* dataset, - size_t n_rows, - uint32_t* labels, + IdxT n_rows, + LabelT* labels, raft::distance::DistanceType metric, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::mr::device_memory_resource* mr = nullptr, + const float* dataset_norm = nullptr) { common::nvtx::range fun_scope( - "kmeans::predict(%zu, %u)", n_rows, n_clusters); + "kmeans::predict(%zu, %u)", static_cast(n_rows), n_clusters); if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows); + IdxT max_minibatch_size = + calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v); rmm::device_uvector cur_dataset( std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); - auto cur_dataset_ptr = cur_dataset.data(); - for (size_t offset = 0; offset < n_rows; offset += max_minibatch_size) { - auto minibatch_size = std::min(max_minibatch_size, n_rows - offset); + bool need_compute_norm = + dataset_norm == nullptr && (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded); + rmm::device_uvector cur_dataset_norm( + need_compute_norm ? max_minibatch_size : 0, stream, mr); + const float* dataset_norm_ptr = nullptr; + auto cur_dataset_ptr = cur_dataset.data(); + for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { + IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); if constexpr (std::is_same_v) { cur_dataset_ptr = const_cast(dataset + offset * dim); } else { linalg::unaryOp(cur_dataset_ptr, dataset + offset * dim, - minibatch_size * dim, + (IdxT)(minibatch_size * dim), utils::mapping{}, stream); } - predict_float_core(handle, - centers, - n_clusters, - dim, - cur_dataset_ptr, - minibatch_size, - labels + offset, - metric, - stream, - mr); + // Compute the norm now if it hasn't been pre-computed. 
+ if (need_compute_norm) { + compute_norm( + cur_dataset_norm.data(), cur_dataset_ptr, (IdxT)dim, (IdxT)minibatch_size, stream, mr); + dataset_norm_ptr = cur_dataset_norm.data(); + } else if (dataset_norm != nullptr) { + dataset_norm_ptr = dataset_norm + offset; + } + + predict_float_core(handle, + centers, + n_clusters, + dim, + cur_dataset_ptr, + dataset_norm_ptr, + minibatch_size, + labels + offset, + metric, + stream, + mr); + } +} + +template +__global__ void __launch_bounds__((WarpSize * BlockDimY)) + adjust_centers_kernel(float* centers, // [n_clusters, dim] + uint32_t n_clusters, + uint32_t dim, + const T* dataset, // [n_rows, dim] + IdxT n_rows, + const LabelT* labels, // [n_rows] + const uint32_t* cluster_sizes, // [n_clusters] + float threshold, + uint32_t average, + uint32_t seed, + uint32_t* count) +{ + uint32_t l = threadIdx.y + BlockDimY * blockIdx.y; + if (l >= n_clusters) return; + auto csize = cluster_sizes[l]; + // skip big clusters + if (csize > static_cast(average * threshold)) return; + + // choose a "random" i that belongs to a rather large cluster + IdxT i; + uint32_t j = laneId(); + if (j == 0) { + do { + auto old = static_cast(atomicAdd(count, 1)); + i = (seed * (old + 1)) % n_rows; + } while (cluster_sizes[labels[i]] < average); + } + i = raft::shfl(i, 0); + + // Adjust the center of the selected smaller cluster to gravitate towards + // a sample from the selected larger cluster. + const IdxT li = static_cast(labels[i]); + // Weight of the current center for the weighted average. + // We dump it for anomalously small clusters, but keep constant overwise. + const float wc = csize > kAdjustCentersWeight ? kAdjustCentersWeight : float(csize); + // Weight for the datapoint used to shift the center. + const float wd = 1.0; + for (; j < dim; j += WarpSize) { + float val = 0; + val += wc * centers[j + dim * li]; + val += wd * utils::mapping{}(dataset[j + static_cast(dim) * i]); + val /= wc + wd; + centers[j + dim * l] = val; } } @@ -261,7 +432,7 @@ void predict(const handle_t& handle, * * NB: if this function returns `true`, you should update the labels. * - * NB: all pointers are used on the host side. + * NB: all pointers are used either on the host side or on the device side together. * * @tparam T element type * @@ -275,115 +446,187 @@ void predict(const handle_t& handle, * @param threshold defines a criterion for adjusting a cluster * (cluster_sizes <= average_size * threshold) * 0 <= threshold < 1 + * @param device_memory memory resource to use for temporary allocations * @param stream * * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated). 
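+ *
+ * Typical use inside the EM loop (a sketch; see `balancing_em_iters` below):
+ * @code{.cpp}
+ *   if (kmeans::adjust_centers(centers, n_clusters, dim, dataset, n_rows, labels,
+ *                              cluster_sizes, 0.25f, stream, device_memory)) {
+ *     // centers moved; labels must be recomputed with predict()
+ *   }
+ * @endcode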
*/ -template +template auto adjust_centers(float* centers, uint32_t n_clusters, uint32_t dim, const T* dataset, - size_t n_rows, - const uint32_t* labels, + IdxT n_rows, + const LabelT* labels, const uint32_t* cluster_sizes, float threshold, - rmm::cuda_stream_view stream) -> bool + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* device_memory) -> bool { common::nvtx::range fun_scope( - "kmeans::adjust_centers(%zu, %u)", n_rows, n_clusters); - stream.synchronize(); + "kmeans::adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); if (n_clusters == 0) { return false; } constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; - static size_t i = 0; - static size_t i_primes = 0; + static IdxT i = 0; + static IdxT i_primes = 0; bool adjusted = false; - uint32_t average = static_cast(n_rows / size_t(n_clusters)); + uint32_t average = static_cast(n_rows / static_cast(n_clusters)); uint32_t ofst; - do { i_primes = (i_primes + 1) % kPrimes.size(); ofst = kPrimes[i_primes]; } while (n_rows % ofst == 0); - for (uint32_t l = 0; l < n_clusters; l++) { - auto csize = cluster_sizes[l]; - // skip big clusters - if (csize > static_cast(average * threshold)) continue; - // choose a "random" i that belongs to a rather large cluster - do { - i = (i + ofst) % n_rows; - } while (cluster_sizes[labels[i]] < average); - // Adjust the center of the selected smaller cluster to gravitate towards - // a sample from the selected larger cluster. - const size_t li = labels[i]; - // Weight of the current center for the weighted average. - // We dump it for anomalously small clusters, but keep constant overwise. - const float wc = std::min(csize, 7.0); - // Weight for the datapoint used to shift the center. - const float wd = 1.0; - for (uint32_t j = 0; j < dim; j++) { - float val = 0; - val += wc * centers[j + dim * li]; - val += wd * utils::mapping{}(dataset[j + size_t(dim) * i]); - val /= wc + wd; - centers[j + dim * l] = val; - } - adjusted = true; + switch (utils::check_pointer_residency(centers, dataset, labels, cluster_sizes)) { + case utils::pointer_residency::host_and_device: + case utils::pointer_residency::device_only: { + constexpr uint32_t kBlockDimY = 4; + const dim3 block_dim(WarpSize, kBlockDimY, 1); + const dim3 grid_dim(1, raft::ceildiv(n_clusters, kBlockDimY), 1); + rmm::device_scalar update_count(0, stream, device_memory); + adjust_centers_kernel<<>>(centers, + n_clusters, + dim, + dataset, + n_rows, + labels, + cluster_sizes, + threshold, + average, + ofst, + update_count.data()); + adjusted = update_count.value(stream) > 0; // NB: rmm scalar performs the sync + } break; + case utils::pointer_residency::host_only: { + stream.synchronize(); + for (uint32_t l = 0; l < n_clusters; l++) { + auto csize = cluster_sizes[l]; + // skip big clusters + if (csize > static_cast(average * threshold)) continue; + // choose a "random" i that belongs to a rather large cluster + do { + i = (i + ofst) % n_rows; + } while (cluster_sizes[labels[i]] < average); + // Adjust the center of the selected smaller cluster to gravitate towards + // a sample from the selected larger cluster. + const IdxT li = static_cast(labels[i]); + // Weight of the current center for the weighted average. + // We dump it for anomalously small clusters, but keep constant overwise. 
+        const float wc = std::min(csize, kAdjustCentersWeight);
+        // Weight for the datapoint used to shift the center.
+        const float wd = 1.0;
+        for (uint32_t j = 0; j < dim; j++) {
+          float val = 0;
+          val += wc * centers[j + dim * li];
+          val += wd * utils::mapping{}(dataset[j + static_cast(dim) * i]);
+          val /= wc + wd;
+          centers[j + dim * l] = val;
+        }
+        adjusted = true;
+      }
+      stream.synchronize();
+    } break;
+    default: RAFT_FAIL("All pointers must reside on the same side, host or device.");
+  }
-  stream.synchronize();
   return adjusted;
 }
 
-/** predict & adjust_centers combined in an iterative process. */
-template
-void build_clusters(const handle_t& handle,
-                    uint32_t n_iters,
-                    uint32_t dim,
-                    const T* dataset,  // managedl [n_rows, dim]
-                    size_t n_rows,
-                    uint32_t n_clusters,
-                    float* cluster_centers,    // managed; [n_clusters, dim]
-                    uint32_t* cluster_labels,  // managed; [n_rows]
-                    uint32_t* cluster_sizes,   // managed; [n_clusters]
-                    raft::distance::DistanceType metric,
-                    rmm::mr::device_memory_resource* device_memory,
-                    rmm::cuda_stream_view stream)
+/**
+ * @brief Expectation-maximization-balancing combined in an iterative process.
+ *
+ * Note that `cluster_centers` is assumed to be already initialized here.
+ * Thus, this function can be used for fine-tuning existing clusters;
+ * to train from scratch, use the `build_clusters` function below.
+ *
+ * @tparam T element type
+ * @tparam IdxT index type
+ * @tparam LabelT label type
+ *
+ * @param handle
+ * @param n_iters the requested number of iterations
+ * @param dim the dimensionality of the dataset
+ * @param[in] dataset a pointer to a managed row-major array [n_rows, dim]
+ * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows]
+ * @param n_rows the number of rows in the dataset
+ * @param n_clusters the requested number of clusters
+ * @param[inout] cluster_centers a pointer to a managed row-major array [n_clusters, dim]
+ * @param[out] cluster_labels a pointer to a managed row-major array [n_rows]
+ * @param[out] cluster_sizes a pointer to a managed row-major array [n_clusters]
+ * @param metric the distance type (there is a tweak in place for the similarity-based metrics)
+ * @param balancing_pullback
+ *   if the cluster centers are rebalanced on this number of iterations,
+ *   one extra iteration is performed (this could happen several times) (default should be `2`).
+ *   In other words, the first and then every `balancing_pullback`-th rebalancing operation adds
+ *   one more iteration to the main cycle.
+ * @param balancing_threshold
+ *   the rebalancing takes place if any cluster is smaller than `avg_size * balancing_threshold`
+ *   on a given iteration (default should be `~ 0.25`).
+ * @param stream + * @param device_memory + * a memory resource for device allocations (makes sense to provide a memory pool here) + */ +template +void balancing_em_iters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, + const float* dataset_norm, + IdxT n_rows, + uint32_t n_clusters, + float* cluster_centers, + LabelT* cluster_labels, + uint32_t* cluster_sizes, + raft::distance::DistanceType metric, + uint32_t balancing_pullback, + float balancing_threshold, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* device_memory) { - // "randomly initialize labels" - auto f = [n_clusters] __device__(uint32_t * out, size_t i) { - *out = uint32_t(i % size_t(n_clusters)); - }; - linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); - - // update centers to match the initialized labels. - calc_centers_and_sizes( - cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); - - for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) { + uint32_t balancing_counter = balancing_pullback; + for (uint32_t iter = 0; iter < n_iters; iter++) { + // Balancing step - move the centers around to equalize cluster sizes + // (but not on the first iteration) + if (iter > 0 && kmeans::adjust_centers(cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + cluster_sizes, + balancing_threshold, + stream, + device_memory)) { + if (balancing_counter++ >= balancing_pullback) { + balancing_counter -= balancing_pullback; + n_iters++; + } + } switch (metric) { // For some metrics, cluster calculation and adjustment tends to favor zero center vectors. // To avoid converging to zero, we normalize the center vectors on every iteration. case raft::distance::DistanceType::InnerProduct: case raft::distance::DistanceType::CosineExpanded: case raft::distance::DistanceType::CorrelationExpanded: - utils::normalize_rows(n_clusters, dim, cluster_centers, stream); + utils::normalize_rows(n_clusters, dim, cluster_centers, stream); default: break; } - predict(handle, - cluster_centers, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - metric, - stream, - device_memory); + // E: Expectation step - predict labels + predict(handle, + cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + metric, + stream, + device_memory, + dataset_norm); + // M: Maximization step - calculate optimal cluster centers calc_centers_and_sizes(cluster_centers, cluster_sizes, n_clusters, @@ -393,28 +636,63 @@ void build_clusters(const handle_t& handle, cluster_labels, true, stream); - - if (iter + 1 < 2 * n_iters) { - if (kmeans::adjust_centers(cluster_centers, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - cluster_sizes, - (float)1.0 / 4, - stream)) { - iter -= 1; - } - } } } +/** Randomly initialize cluster centers and then call `balancing_em_iters`. 
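+ *
+ * A sketch of a typical call (buffers assumed preallocated; illustration only):
+ * @code{.cpp}
+ *   build_clusters(handle, n_iters, dim, dataset, n_rows, n_clusters,
+ *                  centers, labels, sizes, metric, stream, device_memory);
+ * @endcode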
*/ +template +void build_clusters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, + IdxT n_rows, + uint32_t n_clusters, + float* cluster_centers, + LabelT* cluster_labels, + uint32_t* cluster_sizes, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* device_memory, + const float* dataset_norm = nullptr) +{ + RAFT_EXPECTS(static_cast(n_rows) * static_cast(dim) <= + static_cast(std::numeric_limits::max()), + "the chosen index type cannot represent all indices for the given dataset"); + + // "randomly initialize labels" + auto f = [n_clusters] __device__(LabelT * out, IdxT i) { + *out = LabelT(i % static_cast(n_clusters)); + }; + linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); + + // update centers to match the initialized labels. + calc_centers_and_sizes( + cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); + + // run EM + balancing_em_iters(handle, + n_iters, + dim, + dataset, + dataset_norm, + n_rows, + n_clusters, + cluster_centers, + cluster_labels, + cluster_sizes, + metric, + 2, + 0.25f, + stream, + device_memory); +} + /** Calculate how many fine clusters should belong to each mesocluster. */ -auto arrange_fine_clusters(uint32_t n_clusters, - uint32_t n_mesoclusters, - size_t n_rows, - const uint32_t* mesocluster_sizes) +template +inline auto arrange_fine_clusters(uint32_t n_clusters, + uint32_t n_mesoclusters, + IdxT n_rows, + const uint32_t* mesocluster_sizes) { std::vector fine_clusters_nums(n_mesoclusters); std::vector fine_clusters_csum(n_mesoclusters + 1); @@ -425,8 +703,8 @@ auto arrange_fine_clusters(uint32_t n_clusters, for (uint32_t i = 0; i < n_mesoclusters; i++) { n_nonempty_ms_rem += mesocluster_sizes[i] > 0 ? 1 : 0; } - size_t n_rows_rem = n_rows; - size_t mesocluster_size_sum = 0; + IdxT n_rows_rem = n_rows; + IdxT mesocluster_size_sum = 0; uint32_t mesocluster_size_max = 0; uint32_t fine_clusters_nums_max = 0; for (uint32_t i = 0; i < n_mesoclusters; i++) { @@ -456,14 +734,12 @@ auto arrange_fine_clusters(uint32_t n_clusters, RAFT_EXPECTS(mesocluster_size_sum == n_rows, "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)", - mesocluster_size_sum, - n_rows); + static_cast(mesocluster_size_sum), + static_cast(n_rows)); RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters, "fine cluster numbers do not add up (%u) to the total number of clusters (%u)", fine_clusters_csum[n_mesoclusters], - n_clusters - - ); + n_clusters); return std::make_tuple(mesocluster_size_max, fine_clusters_nums_max, @@ -480,17 +756,18 @@ auto arrange_fine_clusters(uint32_t n_clusters, * 2. Predict fine cluster * 3. Refince the fine cluster centers * - * As a result, the fine clusters are what is returned by `build_optimized_kmeans`; + * As a result, the fine clusters are what is returned by `build_hierarchical`; * this function returns the total number of fine clusters, which can be checked to be * the same as the requested number of clusters. 
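+ *
+ * Schematically, for each mesocluster `i` the loop below: (1) gathers the vectors
+ * (and, for L2 metrics, their norms) belonging to mesocluster `i`, (2) runs
+ * `build_clusters` on that subset to train `fine_clusters_nums[i]` centers, and
+ * (3) copies the resulting centers into the global `cluster_centers` array.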
*/ -template +template auto build_fine_clusters(const handle_t& handle, uint32_t n_iters, uint32_t dim, const T* dataset_mptr, - const uint32_t* labels_mptr, - size_t n_rows, + const float* dataset_norm_mptr, + const LabelT* labels_mptr, + IdxT n_rows, const uint32_t* fine_clusters_nums, const uint32_t* fine_clusters_csum, const uint32_t* mesocluster_sizes, @@ -499,32 +776,35 @@ auto build_fine_clusters(const handle_t& handle, uint32_t fine_clusters_nums_max, float* cluster_centers, raft::distance::DistanceType metric, - rmm::mr::managed_memory_resource* managed_memory, + rmm::mr::device_memory_resource* managed_memory, rmm::mr::device_memory_resource* device_memory, rmm::cuda_stream_view stream) -> uint32_t { - rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); - rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, managed_memory); - auto mc_trainset_ids = mc_trainset_ids_buf.data(); - auto mc_trainset = mc_trainset_buf.data(); + rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); + rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, device_memory); + rmm::device_uvector mc_trainset_norm_buf(mesocluster_size_max, stream, device_memory); + auto mc_trainset_ids = mc_trainset_ids_buf.data(); + auto mc_trainset = mc_trainset_buf.data(); + auto mc_trainset_norm = mc_trainset_norm_buf.data(); // label (cluster ID) of each vector - rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, managed_memory); + rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, device_memory); rmm::device_uvector mc_trainset_ccenters( - fine_clusters_nums_max * dim, stream, managed_memory); + fine_clusters_nums_max * dim, stream, device_memory); // number of vectors in each cluster rmm::device_uvector mc_trainset_csizes_tmp( - fine_clusters_nums_max, stream, managed_memory); + fine_clusters_nums_max, stream, device_memory); // Training clusters in each meso-cluster uint32_t n_clusters_done = 0; for (uint32_t i = 0; i < n_mesoclusters; i++) { uint32_t k = 0; - for (size_t j = 0; j < n_rows; j++) { - if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; } + for (IdxT j = 0; j < n_rows; j++) { + if (labels_mptr[j] == (LabelT)i) { mc_trainset_ids[k++] = j; } } - RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i); + if (k != mesocluster_sizes[i]) + RAFT_LOG_WARN("Incorrect mesocluster size at %d. 
%d vs %d", i, k, mesocluster_sizes[i]); if (k == 0) { RAFT_LOG_DEBUG("Empty cluster %d", i); RAFT_EXPECTS(fine_clusters_nums[i] == 0, @@ -536,21 +816,36 @@ auto build_fine_clusters(const handle_t& handle, "Number of fine clusters must be non-zero for a non-empty mesocluster"); } - utils::copy_selected( - mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream); + utils::copy_selected((IdxT)mesocluster_sizes[i], + (IdxT)dim, + dataset_mptr, + mc_trainset_ids, + (IdxT)dim, + mc_trainset, + (IdxT)dim, + stream); + if (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded) { + thrust::gather(handle.get_thrust_policy(), + mc_trainset_ids, + mc_trainset_ids + mesocluster_sizes[i], + dataset_norm_mptr, + mc_trainset_norm); + } - build_clusters(handle, - n_iters, - dim, - mc_trainset, - mesocluster_sizes[i], - fine_clusters_nums[i], - mc_trainset_ccenters.data(), - mc_trainset_labels.data(), - mc_trainset_csizes_tmp.data(), - metric, - device_memory, - stream); + build_clusters(handle, + n_iters, + dim, + mc_trainset, + mesocluster_sizes[i], + fine_clusters_nums[i], + mc_trainset_ccenters.data(), + mc_trainset_labels.data(), + mc_trainset_csizes_tmp.data(), + metric, + stream, + device_memory, + mc_trainset_norm); raft::copy(cluster_centers + (dim * fine_clusters_csum[i]), mc_trainset_ccenters.data(), @@ -563,9 +858,11 @@ auto build_fine_clusters(const handle_t& handle, } /** - * kmeans + * @brief Hierarchical balanced k-means * - * @tparam T element type + * @tparam T element type + * @tparam IdxT index type + * @tparam LabelT label type * * @param handle * @param n_iters number of training iterations @@ -574,72 +871,79 @@ auto build_fine_clusters(const handle_t& handle, * @param n_rows number of rows in the input * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim] * @param n_cluster - * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training; - * 0 < trainset_fraction <= 1. 
- * @param metric the distance metric + * @param metric the distance type * @param stream */ -template -void build_optimized_kmeans(const handle_t& handle, - uint32_t n_iters, - uint32_t dim, - const T* dataset, - size_t n_rows, - float* cluster_centers, - uint32_t n_clusters, - double trainset_fraction, - raft::distance::DistanceType metric, - rmm::cuda_stream_view stream) +template +void build_hierarchical(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, + IdxT n_rows, + float* cluster_centers, + uint32_t n_clusters, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream) { - common::nvtx::range fun_scope( - "kmeans::build_optimized_kmeans(%zu, %u)", n_rows, n_clusters); + using LabelT = uint32_t; - auto trainset_ratio = - std::max(1, n_rows / std::max(trainset_fraction * n_rows, n_clusters)); - auto n_rows_train = n_rows / trainset_ratio; + RAFT_EXPECTS(static_cast(n_rows) * static_cast(dim) <= + static_cast(std::numeric_limits::max()), + "the chosen index type cannot represent all indices for the given dataset"); + + common::nvtx::range fun_scope( + "kmeans::build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); uint32_t n_mesoclusters = std::min(n_clusters, std::sqrt(n_clusters) + 0.5); - RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters); + RAFT_LOG_DEBUG("kmeans::build_hierarchical: n_mesoclusters: %u", n_mesoclusters); rmm::mr::managed_memory_resource managed_memory; rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource( - device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows_train) * dim * 4); + IdxT max_minibatch_size = + calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v); + auto pool_guard = raft::get_pool_memory_resource(device_memory, max_minibatch_size * dim * 4); if (pool_guard) { RAFT_LOG_DEBUG( - "kmeans::build_optimized_kmeans: using pool memory resource with initial size %zu bytes", + "kmeans::build_hierarchical: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } - rmm::device_uvector trainset(n_rows_train * dim, stream, &managed_memory); - // TODO: a proper sampling - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * dim, - dataset, - sizeof(T) * dim * trainset_ratio, - sizeof(T) * dim, - n_rows_train, - cudaMemcpyDefault, - stream)); + // Precompute the L2 norm of the dataset if relevant. 
+ const float* dataset_norm = nullptr; + rmm::device_uvector dataset_norm_buf(0, stream, device_memory); + if (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded) { + dataset_norm_buf.resize(n_rows, stream); + for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { + IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); + compute_norm(dataset_norm_buf.data() + offset, + dataset + dim * offset, + (IdxT)dim, + (IdxT)minibatch_size, + stream, + device_memory); + } + dataset_norm = (const float*)dataset_norm_buf.data(); + } // build coarse clusters (mesoclusters) - rmm::device_uvector mesocluster_labels_buf(n_rows_train, stream, &managed_memory); + rmm::device_uvector mesocluster_labels_buf(n_rows, stream, &managed_memory); rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); { - rmm::device_uvector mesocluster_centers_buf( - n_mesoclusters * dim, stream, &managed_memory); - build_clusters(handle, - n_iters, - dim, - trainset.data(), - n_rows_train, - n_mesoclusters, - mesocluster_centers_buf.data(), - mesocluster_labels_buf.data(), - mesocluster_sizes_buf.data(), - metric, - device_memory, - stream); + rmm::device_uvector mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory); + build_clusters(handle, + n_iters, + dim, + dataset, + n_rows, + n_mesoclusters, + mesocluster_centers_buf.data(), + mesocluster_labels_buf.data(), + mesocluster_sizes_buf.data(), + metric, + stream, + device_memory, + dataset_norm); } auto mesocluster_sizes = mesocluster_sizes_buf.data(); @@ -649,58 +953,60 @@ void build_optimized_kmeans(const handle_t& handle, // build fine clusters auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = - arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes); + arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes); - if (mesocluster_size_max * n_mesoclusters > 2 * n_rows_train) { - RAFT_LOG_WARN("build_optimized_kmeans: built unbalanced mesoclusters"); + if (mesocluster_size_max * n_mesoclusters > 2 * n_rows) { + RAFT_LOG_WARN("build_hierarchical: built unbalanced mesoclusters"); RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); } - auto n_clusters_done = build_fine_clusters(handle, - n_iters, - dim, - trainset.data(), - mesocluster_labels, - n_rows_train, - fine_clusters_nums.data(), - fine_clusters_csum.data(), - mesocluster_sizes, - n_mesoclusters, - mesocluster_size_max, - fine_clusters_nums_max, - cluster_centers, - metric, - &managed_memory, - device_memory, - stream); + auto n_clusters_done = build_fine_clusters(handle, + n_iters, + dim, + dataset, + dataset_norm, + mesocluster_labels, + n_rows, + fine_clusters_nums.data(), + fine_clusters_csum.data(), + mesocluster_sizes, + n_mesoclusters, + mesocluster_size_max, + fine_clusters_nums_max, + cluster_centers, + metric, + &managed_memory, + device_memory, + stream); RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); rmm::device_uvector cluster_sizes(n_clusters, stream, device_memory); - rmm::device_uvector labels(n_rows_train, stream, device_memory); - - // fit clusters using the trainset - for (int iter = 0; iter < 2; iter++) { - predict(handle, - cluster_centers, - n_clusters, - dim, - trainset.data(), - n_rows_train, - labels.data(), - metric, - stream, - device_memory); - calc_centers_and_sizes(cluster_centers, 
- cluster_sizes.data(), - n_clusters, - dim, - trainset.data(), - n_rows_train, - labels.data(), - true, - stream); - } + rmm::device_uvector labels(n_rows, stream, device_memory); + + // Fine-tuning kmeans for all clusters + // + // (*) Since the likely cluster centroids have been calculated + // hierarchically already, the number of iteration for fine-tuning + // kmeans for whole clusters should be reduced. However, there + // is a possibility that the clusters could be unbalanced here, + // in which case the actual number of iterations would be increased. + // + balancing_em_iters(handle, + std::max(n_iters / 10, 2), + dim, + dataset, + dataset_norm, + n_rows, + n_clusters, + cluster_centers, + labels.data(), + cluster_sizes.data(), + metric, + 5, + 0.2f, + stream, + device_memory); } } // namespace raft::spatial::knn::detail::kmeans diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 5a56a84fe3..e5900ffd69 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -22,11 +22,11 @@ #include "common_faiss.h" #include "processing.cuh" -#include -#include +#include +#include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index e789bafde2..8dda574314 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -16,10 +16,10 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include @@ -135,8 +135,8 @@ struct mapping { * @param[in] value * @param[in] n_bytes */ -template -inline void memzero(T* ptr, size_t n_elems, rmm::cuda_stream_view stream) +template +inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream) { switch (check_pointer_residency(ptr)) { case pointer_residency::host_and_device: @@ -151,18 +151,16 @@ inline void memzero(T* ptr, size_t n_elems, rmm::cuda_stream_view stream) } } -__global__ void argmin_along_rows_kernel(uint32_t n_rows, - uint32_t n_cols, - const float* a, - uint32_t* out) +template +__global__ void argmin_along_rows_kernel(IdxT n_rows, uint32_t n_cols, const float* a, OutT* out) { - __shared__ uint32_t shm_ids[1024]; // NOLINT - __shared__ float shm_vals[1024]; // NOLINT - uint32_t i = blockIdx.x; + __shared__ OutT shm_ids[1024]; // NOLINT + __shared__ float shm_vals[1024]; // NOLINT + IdxT i = blockIdx.x; if (i >= n_rows) return; - uint32_t min_idx = n_cols; - float min_val = raft::upper_bound(); - for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) { + OutT min_idx = n_cols; + float min_val = raft::upper_bound(); + for (OutT j = threadIdx.x; j < n_cols; j += blockDim.x) { if (min_val > a[j + n_cols * i]) { min_val = a[j + n_cols * i]; min_idx = j; @@ -171,7 +169,7 @@ __global__ void argmin_along_rows_kernel(uint32_t n_rows, shm_vals[threadIdx.x] = min_val; shm_ids[threadIdx.x] = min_idx; __syncthreads(); - for (uint32_t offset = blockDim.x / 2; offset > 0; offset >>= 1) { + for (IdxT offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (threadIdx.x < offset) { if (shm_vals[threadIdx.x] < shm_vals[threadIdx.x + offset]) { } else if (shm_vals[threadIdx.x] > shm_vals[threadIdx.x + offset]) { @@ -192,30 +190,35 @@ __global__ void argmin_along_rows_kernel(uint32_t n_rows, * NB: device-only function * TODO: specialize select_k for the case of `k == 1` and use that one instead. 
* + * @tparam IdxT index type + * @tparam OutT output type + * * @param n_rows * @param n_cols * @param[in] a device pointer to the row-major matrix [n_rows, n_cols] * @param[out] out device pointer to the vector of selected indices [n_rows] * @param stream */ +template inline void argmin_along_rows( - uint32_t n_rows, uint32_t n_cols, const float* a, uint32_t* out, rmm::cuda_stream_view stream) + IdxT n_rows, IdxT n_cols, const float* a, OutT* out, rmm::cuda_stream_view stream) { - uint32_t block_dim = 1024; + IdxT block_dim = 1024; while (block_dim > n_cols) { block_dim /= 2; } - block_dim = max(block_dim, 128); - argmin_along_rows_kernel<<>>(n_rows, n_cols, a, out); + block_dim = max(block_dim, (IdxT)128); + argmin_along_rows_kernel<<>>(n_rows, n_cols, a, out); } -__global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const float* a, float* out) +template +__global__ void dots_along_rows_kernel(IdxT n_rows, IdxT n_cols, const float* a, float* out) { - uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x); + IdxT i = threadIdx.y + (blockDim.y * static_cast(blockIdx.x)); if (i >= n_rows) return; float sqsum = 0.0; - for (uint64_t j = threadIdx.x; j < n_cols; j += blockDim.x) { + for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { float val = a[j + (n_cols * i)]; sqsum += val * val; } @@ -232,18 +235,21 @@ __global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const f * * NB: device-only function * + * @tparam IdxT index type + * * @param n_rows * @param n_cols * @param[in] a device pointer to the row-major matrix [n_rows, n_cols] * @param[out] out device pointer to the vector of dot-products [n_rows] * @param stream */ +template inline void dots_along_rows( - uint32_t n_rows, uint32_t n_cols, const float* a, float* out, rmm::cuda_stream_view stream) + IdxT n_rows, IdxT n_cols, const float* a, float* out, rmm::cuda_stream_view stream) { dim3 threads(32, 4, 1); - dim3 blocks(ceildiv(n_rows, threads.y), 1, 1); - dots_along_rows_kernel<<>>(n_rows, n_cols, a, out); + dim3 blocks(ceildiv(n_rows, threads.y), 1, 1); + dots_along_rows_kernel<<>>(n_rows, n_cols, a, out); /** * TODO: this can be replaced with the rowNorm helper as shown below. * However, the rowNorm helper seems to incur a significant performance penalty @@ -253,19 +259,19 @@ inline void dots_along_rows( */ } -template -__global__ void accumulate_into_selected_kernel(uint32_t n_rows, +template +__global__ void accumulate_into_selected_kernel(IdxT n_rows, uint32_t n_cols, float* output, uint32_t* selection_counters, const T* input, - const uint32_t* row_ids) + const LabelT* row_ids) { - uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t j = gid % n_cols; - uint64_t i = gid / n_cols; + IdxT gid = threadIdx.x + (blockDim.x * static_cast(blockIdx.x)); + IdxT j = gid % n_cols; + IdxT i = gid / n_cols; if (i >= n_rows) return; - uint64_t l = row_ids[i]; + IdxT l = static_cast(row_ids[i]); if (j == 0) { atomicAdd(&(selection_counters[l]), 1); } atomicAdd(&(output[j + n_cols * l]), mapping{}(input[gid])); } @@ -275,7 +281,9 @@ __global__ void accumulate_into_selected_kernel(uint32_t n_rows, * (cast and possibly scale the data input type). Count the number of times every output * row was selected along the way. 
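 * (for each input row i: output[row_ids[i], :] += mapping(input[i, :]), and
 * selection_counters[row_ids[i]] is incremented by one)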
* - * @tparam T + * @tparam T element type + * @tparam IdxT index type + * @tparam LabelT label type * * @param n_cols number of columns in all matrices * @param[out] output output matrix [..., n_cols] @@ -284,13 +292,13 @@ __global__ void accumulate_into_selected_kernel(uint32_t n_rows, * @param[in] input row-major input matrix [n_rows, n_cols] * @param[in] row_ids row indices in the output matrix [n_rows] */ -template -void accumulate_into_selected(size_t n_rows, +template +void accumulate_into_selected(IdxT n_rows, uint32_t n_cols, float* output, uint32_t* selection_counters, const T* input, - const uint32_t* row_ids, + const LabelT* row_ids, rmm::cuda_stream_view stream) { switch (check_pointer_residency(output, input, selection_counters, row_ids)) { @@ -298,16 +306,16 @@ void accumulate_into_selected(size_t n_rows, case pointer_residency::device_only: { uint32_t block_dim = 128; auto grid_dim = - static_cast(ceildiv(n_rows * static_cast(n_cols), block_dim)); + static_cast(ceildiv(n_rows * static_cast(n_cols), block_dim)); accumulate_into_selected_kernel<<>>( n_rows, n_cols, output, selection_counters, input, row_ids); } break; case pointer_residency::host_only: { stream.synchronize(); - for (size_t i = 0; i < n_rows; i++) { - uint32_t l = row_ids[i]; + for (IdxT i = 0; i < n_rows; i++) { + IdxT l = static_cast(row_ids[i]); selection_counters[l]++; - for (uint32_t j = 0; j < n_cols; j++) { + for (IdxT j = 0; j < n_cols; j++) { output[j + n_cols * l] += mapping{}(input[j + n_cols * i]); } } @@ -317,13 +325,14 @@ void accumulate_into_selected(size_t n_rows, } } -__global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a) +template +__global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a) { - uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x); + IdxT i = threadIdx.y + (blockDim.y * static_cast(blockIdx.x)); if (i >= n_rows) return; float sqsum = 0.0; - for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) { + for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { float val = a[j + (n_cols * i)]; sqsum += val * val; } @@ -334,7 +343,7 @@ __global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); if (sqsum <= 1e-8) return; sqsum = rsqrtf(sqsum); // reciprocal of the square root - for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) { + for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { a[j + n_cols * i] *= sqsum; } } @@ -344,65 +353,66 @@ __global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a * * NB: device-only function * + * @tparam IdxT index type + * * @param[in] n_rows * @param[in] n_cols * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols] * @param stream */ -inline void normalize_rows(uint32_t n_rows, uint32_t n_cols, float* a, rmm::cuda_stream_view stream) +template +inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_view stream) { dim3 threads(32, 4, 1); // DO NOT CHANGE dim3 blocks(ceildiv(n_rows, threads.y), 1, 1); - normalize_rows_kernel<<>>(n_rows, n_cols, a); + normalize_rows_kernel<<>>(n_rows, n_cols, a); } -template +template __global__ void map_along_rows_kernel( - uint32_t n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map) + IdxT n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map) { - uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x; - uint64_t i = gid / n_cols; + IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); + IdxT i 
= gid / n_cols; if (i >= n_rows) return; float& x = a[gid]; x = map(x, d[i]); } /** - * @brief Divide matrix values along rows by an integer value, skipping rows if the corresponding - * divisor is zero. + * @brief Map a binary function over a matrix and a vector element-wise, broadcasting the vector + * values along rows: `m[i, j] = op(m[i,j], v[i])` * * NB: device-only function * + * @tparam IdxT index type * @tparam Lambda * * @param n_rows * @param n_cols - * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols] - * @param[in] d device pointer to a vector [n_rows] - * @param map the binary operation to apply on every element of matrix rows and of the vector + * @param[inout] m device pointer to a row-major matrix [n_rows, n_cols] + * @param[in] v device pointer to a vector [n_rows] + * @param op the binary operation to apply on every element of matrix rows and of the vector */ -template -inline void map_along_rows(uint32_t n_rows, +template +inline void map_along_rows(IdxT n_rows, uint32_t n_cols, - float* a, - const uint32_t* d, - Lambda map, + float* m, + const uint32_t* v, + Lambda op, rmm::cuda_stream_view stream) { dim3 threads(128, 1, 1); - dim3 blocks( - ceildiv(static_cast(n_rows) * static_cast(n_cols), threads.x), - 1, - 1); - map_along_rows_kernel<<>>(n_rows, n_cols, a, d, map); + dim3 blocks(ceildiv(n_rows * n_cols, threads.x), 1, 1); + map_along_rows_kernel<<>>(n_rows, n_cols, m, v, op); } -template -__global__ void outer_add_kernel(const T* a, uint32_t len_a, const T* b, uint32_t len_b, T* c) +template +__global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { - uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x; - uint64_t i = gid / len_b; - uint64_t j = gid % len_b; + IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); + IdxT i = gid / len_b; + IdxT j = gid % len_b; if (i >= len_a) return; c[gid] = (a == nullptr ? T(0) : a[i]) + (b == nullptr ? T(0) : b[j]); } @@ -415,7 +425,7 @@ __global__ void block_copy_kernel(const IdxT* in_offsets, T* out_data, IdxT n_mult) { - IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; + IdxT i = static_cast(blockDim.x) * static_cast(blockIdx.x) + threadIdx.x; // find the source offset using the binary search. 
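  // (sketch: a lower-bound search that halves [l, r) until l is the block whose
  //  offset range, scaled by n_mult, contains the flat index i; the element is
  //  then copied to the matching position under the output offsets)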
uint32_t l = 0; uint32_t r = n_blocks; @@ -472,7 +482,8 @@ void block_copy(const IdxT* in_offsets, * * NB: device-only function * - * @tparam T element type + * @tparam T element type + * @tparam IdxT index type * * @param[in] a device pointer to a vector [len_a] * @param len_a number of elements in `a` @@ -481,32 +492,23 @@ void block_copy(const IdxT* in_offsets, * @param[out] c row-major matrix [len_a, len_b] * @param stream */ -template -void outer_add( - const T* a, uint32_t len_a, const T* b, uint32_t len_b, T* c, rmm::cuda_stream_view stream) +template +void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_stream_view stream) { dim3 threads(128, 1, 1); - dim3 blocks( - ceildiv(static_cast(len_a) * static_cast(len_b), threads.x), - 1, - 1); + dim3 blocks(ceildiv(len_a * len_b, threads.x), 1, 1); outer_add_kernel<<>>(a, len_a, b, len_b, c); } -template -__global__ void copy_selected_kernel(uint32_t n_rows, - uint32_t n_cols, - const S* src, - const uint32_t* row_ids, - uint32_t ld_src, - T* dst, - uint32_t ld_dst) +template +__global__ void copy_selected_kernel( + IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst) { - uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x; - uint64_t j = gid % n_cols; - uint64_t i_dst = gid / n_cols; + IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); + IdxT j = gid % n_cols; + IdxT i_dst = gid / n_cols; if (i_dst >= n_rows) return; - uint64_t i_src = row_ids[i_dst]; + auto i_src = static_cast(row_ids[i_dst]); dst[ld_dst * i_dst + j] = mapping{}(src[ld_src * i_src + j]); } @@ -514,8 +516,10 @@ __global__ void copy_selected_kernel(uint32_t n_rows, * @brief Copy selected rows of a matrix while mapping the data from the source to the target * type. 
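 * (for each output row i: dst[i, :] = mapping(src[row_ids[i], :]))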
* - * @tparam T target type - * @tparam S source type + * @tparam T target type + * @tparam S source type + * @tparam IdxT index type + * @tparam LabelT label type * * @param n_rows * @param n_cols @@ -526,29 +530,29 @@ __global__ void copy_selected_kernel(uint32_t n_rows, * @param ld_dst number of cols in the output (ld_dst >= n_cols) * @param stream */ -template -void copy_selected(uint32_t n_rows, - uint32_t n_cols, +template +void copy_selected(IdxT n_rows, + IdxT n_cols, const S* src, - const uint32_t* row_ids, - uint32_t ld_src, + const LabelT* row_ids, + IdxT ld_src, T* dst, - uint32_t ld_dst, + IdxT ld_dst, rmm::cuda_stream_view stream) { - switch (check_pointer_residency(src, dst)) { + switch (check_pointer_residency(src, dst, row_ids)) { case pointer_residency::host_and_device: case pointer_residency::device_only: { - uint32_t block_dim = 128; - uint32_t grid_dim = ceildiv(n_rows * n_cols, block_dim); + IdxT block_dim = 128; + IdxT grid_dim = ceildiv(n_rows * n_cols, block_dim); copy_selected_kernel <<>>(n_rows, n_cols, src, row_ids, ld_src, dst, ld_dst); } break; case pointer_residency::host_only: { stream.synchronize(); - for (uint64_t i_dst = 0; i_dst < n_rows; i_dst++) { - uint64_t i_src = row_ids[i_dst]; - for (uint64_t j = 0; j < n_cols; j++) { + for (IdxT i_dst = 0; i_dst < n_rows; i_dst++) { + auto i_src = static_cast(row_ids[i_dst]); + for (IdxT j = 0; j < n_cols; j++) { dst[ld_dst * i_dst + j] = mapping{}(src[ld_src * i_src + j]); } } @@ -557,4 +561,5 @@ void copy_selected(uint32_t n_rows, default: RAFT_FAIL("All pointers must reside on the same side, host or device."); } } + } // namespace raft::spatial::knn::detail::utils diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 2f7c76a11d..94897daa22 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include "../ball_cover_common.h" #include "ball_cover/common.cuh" @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include @@ -75,8 +75,8 @@ void sample_landmarks(const raft::handle_t& handle, rmm::device_uvector R_indices(index.n_landmarks, handle.get_stream()); thrust::sequence(handle.get_thrust_policy(), - index.get_R_1nn_cols(), - index.get_R_1nn_cols() + index.m, + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_cols().data_handle() + index.m, (value_idx)0); thrust::fill( @@ -93,15 +93,15 @@ void sample_landmarks(const raft::handle_t& handle, rng_state, R_indices.data(), R_1nn_cols2.data(), - index.get_R_1nn_cols(), + index.get_R_1nn_cols().data_handle(), R_1nn_ones.data(), (value_idx)index.n_landmarks, (value_idx)index.m); - raft::matrix::copyRows(index.get_X(), + raft::matrix::copyRows(index.get_X().data_handle(), index.m, index.n, - index.get_R(), + index.get_R().data_handle(), R_1nn_cols2.data(), index.n_landmarks, handle.get_stream(), @@ -133,7 +133,7 @@ void construct_landmark_1nn(const raft::handle_t& handle, std::numeric_limits::max()); value_idx* R_1nn_inds_ptr = R_1nn_inds.data(); - value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); + value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle(); auto idxs = thrust::make_counting_iterator(0); thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, [=] __device__(value_idx i) { @@ -141,16 +141,22 @@ void construct_landmark_1nn(const raft::handle_t& handle, R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; }); - auto keys = - 
thrust::make_zip_iterator(thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists())); + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists().data_handle())); // group neighborhoods for each reference landmark and sort each group by distance - thrust::sort_by_key( - handle.get_thrust_policy(), keys, keys + index.m, index.get_R_1nn_cols(), NNComp()); + thrust::sort_by_key(handle.get_thrust_policy(), + keys, + keys + index.m, + index.get_R_1nn_cols().data_handle(), + NNComp()); // convert to CSR for fast lookup - raft::sparse::convert::sorted_coo_to_csr( - R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, handle.get_stream()); + raft::sparse::convert::sorted_coo_to_csr(R_1nn_inds.data(), + index.m, + index.get_R_indptr().data_handle(), + index.n_landmarks + 1, + handle.get_stream()); } /** @@ -168,15 +174,16 @@ void construct_landmark_1nn(const raft::handle_t& handle, */ template void k_closest_landmarks(const raft::handle_t& handle, - BallCoverIndex& index, + const BallCoverIndex& index, const value_t* query_pts, value_int n_query_pts, value_int k, value_idx* R_knn_inds, value_t* R_knn_dists) { - std::vector input = {index.get_R()}; - std::vector sizes = {index.n_landmarks}; + // TODO: Add const to the brute-force knn inputs + std::vector input = {const_cast(index.get_R().data_handle())}; + std::vector sizes = {index.n_landmarks}; brute_force_knn_impl(handle, input, @@ -190,7 +197,7 @@ void k_closest_landmarks(const raft::handle_t& handle, true, true, nullptr, - index.metric); + index.get_metric()); } /** @@ -207,9 +214,9 @@ void compute_landmark_radii(const raft::handle_t& handle, { auto entries = thrust::make_counting_iterator(0); - const value_idx* R_indptr_ptr = index.get_R_indptr(); - const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); - value_t* R_radius_ptr = index.get_R_radius(); + const value_idx* R_indptr_ptr = index.get_R_indptr().data_handle(); + const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle(); + value_t* R_radius_ptr = index.get_R_radius().data_handle(); thrust::for_each(handle.get_thrust_policy(), entries, entries + index.n_landmarks, @@ -234,7 +241,7 @@ template void perform_rbc_query(const raft::handle_t& handle, - BallCoverIndex& index, + const BallCoverIndex& index, const value_t* query, value_int n_query_pts, std::uint32_t k, @@ -350,8 +357,8 @@ void rbc_build_index(const raft::handle_t& handle, R_knn_inds.end(), std::numeric_limits::max()); thrust::fill(handle.get_thrust_policy(), - index.get_R_closest_landmark_dists(), - index.get_R_closest_landmark_dists() + index.m, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_closest_landmark_dists().data_handle() + index.m, std::numeric_limits::max()); /** @@ -365,11 +372,11 @@ void rbc_build_index(const raft::handle_t& handle, value_int k = 1; k_closest_landmarks(handle, index, - index.get_X(), + index.get_X().data_handle(), index.m, k, R_knn_inds.data(), - index.get_R_closest_landmark_dists()); + index.get_R_closest_landmark_dists().data_handle()); /** * 3. 
Create L_r = knn[:,0].T (CSR) @@ -377,7 +384,8 @@ void rbc_build_index(const raft::handle_t& handle, * Slice closest neighboring R * Secondary sort by (R_knn_inds, R_knn_dists) */ - construct_landmark_1nn(handle, R_knn_inds.data(), index.get_R_closest_landmark_dists(), k, index); + construct_landmark_1nn( + handle, R_knn_inds.data(), index.get_R_closest_landmark_dists().data_handle(), k, index); /** * Compute radius of each R for filtering: p(q, r) <= p(q, q_r) + radius(r) @@ -432,7 +440,7 @@ void rbc_all_knn_query(const raft::handle_t& handle, sample_landmarks(handle, index); k_closest_landmarks( - handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); + handle, index, index.get_X().data_handle(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); @@ -440,7 +448,7 @@ void rbc_all_knn_query(const raft::handle_t& handle, perform_rbc_query(handle, index, - index.get_X(), + index.get_X().data_handle(), index.m, k, R_knn_inds.data(), @@ -463,7 +471,7 @@ template void rbc_knn_query(const raft::handle_t& handle, - BallCoverIndex& index, + const BallCoverIndex& index, value_int k, const value_t* query, value_int n_query_pts, diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh index 32f55c7931..112ab9f13c 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -331,7 +331,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index, value_idx* out_inds, value_t* out_dists, value_int* dist_counter, - value_t* R_radius, + const value_t* R_radius, distance_func dfunc, float weight = 1.0) { @@ -472,7 +472,7 @@ template void rbc_low_dim_pass_one(const raft::handle_t& handle, - BallCoverIndex& index, + const BallCoverIndex& index, const value_t* query, const value_int n_query_rows, value_int k, @@ -486,114 +486,114 @@ void rbc_low_dim_pass_one(const raft::handle_t& handle, { if (k <= 32) block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); else if (k <= 64) block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); else if (k <= 128) block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); else if (k <= 256) 
block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); else if (k <= 512) block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); else if (k <= 1024) block_rbc_kernel_registers - <<>>(index.get_X(), + <<>>(index.get_X().data_handle(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), inds, dists, dists_counter, - index.get_R_radius(), + index.get_R_radius().data_handle(), dfunc, weight); } @@ -604,7 +604,7 @@ template void rbc_low_dim_pass_two(const raft::handle_t& handle, - BallCoverIndex& index, + const BallCoverIndex& index, const value_t* query, const value_int n_query_rows, value_int k, @@ -627,8 +627,8 @@ void rbc_low_dim_pass_two(const raft::handle_t& handle, index.n, R_knn_inds, R_knn_dists, - index.get_R_radius(), - index.get_R(), + index.get_R_radius().data_handle(), + index.get_R().data_handle(), index.n_landmarks, bitset_size, k, @@ -645,22 +645,22 @@ void rbc_low_dim_pass_two(const raft::handle_t& handle, 32, 2, 128, - dims> - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 64) compute_final_dists_registers - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 128) compute_final_dists_registers - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + 
query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 256) compute_final_dists_registers - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 512) compute_final_dists_registers - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 1024) compute_final_dists_registers - <<>>(index.get_X(), - query, - index.n, - bitset.data(), - bitset_size, - index.get_R_closest_landmark_dists(), - index.get_R_indptr(), - index.get_R_1nn_cols(), - index.get_R_1nn_dists(), - inds, - dists, - index.n_landmarks, - k, - dfunc, - post_dists_counter); + dims><<>>( + index.get_X().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index aca1571de2..b098d0991d 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -16,11 +16,11 @@ #pragma once -#include -#include +#include +#include #include -#include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 5d703bdb8d..b5ae9e7d5e 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include #include @@ -25,8 +25,8 @@ #include #include -#include -#include +#include +#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index 96af5c9522..af1cb97d36 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include 
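Aside: the `rbc_low_dim_pass_*` launches earlier in this diff pick a compile-time top-k capacity from the runtime `k` by chaining `else if (k <= ...)` branches, one template instantiation per power-of-two bound. A minimal host-side sketch of that dispatch pattern; `run_kernel` and `dispatch_by_k` are illustrative names, not part of the diff:

```cpp
#include <cstdint>
#include <stdexcept>

// Stand-in for a kernel templated on its top-k capacity (a power of two).
template <int Capacity>
void run_kernel(uint32_t k) { /* queues and shared memory sized for Capacity */ }

// Map the runtime k to the smallest supported capacity, so each supported
// bound gets exactly one template instantiation.
inline void dispatch_by_k(uint32_t k) {
  if (k <= 32)        run_kernel<32>(k);
  else if (k <= 64)   run_kernel<64>(k);
  else if (k <= 128)  run_kernel<128>(k);
  else if (k <= 256)  run_kernel<256>(k);
  else if (k <= 512)  run_kernel<512>(k);
  else if (k <= 1024) run_kernel<1024>(k);
  else throw std::invalid_argument("k must be at most 1024");
}
```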
@@ -44,8 +44,9 @@ using namespace raft::spatial::knn::detail; // NOLINT * X dimension must cover the dataset (n_rows), YZ are not used; * there are no dependencies between threads, hence no constraints on the block size. * - * @tparam T the element type. - * @tparam IdxT type of the indices in the source source_vecs + * @tparam T element type. + * @tparam IdxT type of the indices in the source source_vecs + * @tparam LabelT label type * * @param[in] labels device pointer to the cluster ids for each row [n_rows] * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists] @@ -60,8 +61,8 @@ using namespace raft::spatial::knn::detail; // NOLINT * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`. * */ -template -__global__ void build_index_kernel(const uint32_t* labels, +template +__global__ void build_index_kernel(const LabelT* labels, const IdxT* list_offsets, const T* source_vecs, const IdxT* source_ixs, @@ -108,39 +109,40 @@ inline auto extend(const handle_t& handle, const index& orig_index, const T* new_vectors, const IdxT* new_indices, - IdxT n_rows, - rmm::cuda_stream_view stream) -> index + IdxT n_rows) -> index { - auto n_lists = orig_index.n_lists; - auto dim = orig_index.dim; + using LabelT = uint32_t; + + auto stream = handle.get_stream(); + auto n_lists = orig_index.n_lists(); + auto dim = orig_index.dim(); common::nvtx::range fun_scope( "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim); - RAFT_EXPECTS(new_indices != nullptr || orig_index.size == 0, + RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0, "You must pass data indices when the index is non-empty."); - rmm::device_uvector new_labels(n_rows, stream); - kmeans::predict(handle, - orig_index.centers.data(), - n_lists, - dim, - new_vectors, - n_rows, - new_labels.data(), - orig_index.metric, - stream); - - auto&& list_sizes = rmm::device_uvector(n_lists, stream); - auto&& list_offsets = rmm::device_uvector(n_lists + 1, stream); - auto list_sizes_ptr = list_sizes.data(); - auto list_offsets_ptr = list_offsets.data(); - - auto&& centers = rmm::device_uvector(size_t(n_lists) * size_t(dim), stream); - auto centers_ptr = centers.data(); + rmm::device_uvector new_labels(n_rows, stream); + kmeans::predict(handle, + orig_index.centers().data_handle(), + n_lists, + dim, + new_vectors, + n_rows, + new_labels.data(), + orig_index.metric(), + stream); + + index ext_index(handle, orig_index.metric(), n_lists, dim); + + auto list_sizes_ptr = ext_index.list_sizes().data_handle(); + auto list_offsets_ptr = ext_index.list_offsets().data_handle(); + auto centers_ptr = ext_index.centers().data_handle(); // Calculate the centers and sizes on the new data, starting from the original values - raft::copy(centers_ptr, orig_index.centers.data(), centers.size(), stream); - raft::copy(list_sizes_ptr, orig_index.list_sizes.data(), list_sizes.size(), stream); + raft::copy(centers_ptr, orig_index.centers().data_handle(), ext_index.centers().size(), stream); + raft::copy( + list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); kmeans::calc_centers_and_sizes(centers_ptr, list_sizes_ptr, @@ -160,35 +162,36 @@ inline auto extend(const handle_t& handle, list_sizes_ptr, list_sizes_ptr + n_lists, list_offsets_ptr + 1, - [] __device__(IdxT s, uint32_t l) { return s + Pow2::roundUp(l); }); + [] __device__(IdxT s, uint32_t l) { return s + Pow2::roundUp(l); }); update_host(&index_size, list_offsets_ptr + n_lists, 1, stream); 
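  // (sketch of the offset math above: with list sizes {3, 5} and an
  //  interleaving group size assumed to be 4, the sizes round up to {4, 8},
  //  list_offsets becomes {0, 4, 12}, and index_size reads back as 12)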
handle.sync_stream(stream); - auto&& data = rmm::device_uvector(index_size * IdxT(dim), stream); - auto&& indices = rmm::device_uvector(index_size, stream); + ext_index.allocate( + handle, index_size, ext_index.metric() == raft::distance::DistanceType::L2Expanded); // Populate index with the old data - if (orig_index.size > 0) { - utils::block_copy(orig_index.list_offsets.data(), + if (orig_index.size() > 0) { + utils::block_copy(orig_index.list_offsets().data_handle(), list_offsets_ptr, IdxT(n_lists), - orig_index.data.data(), - data.data(), + orig_index.data().data_handle(), + ext_index.data().data_handle(), IdxT(dim), stream); - utils::block_copy(orig_index.list_offsets.data(), + utils::block_copy(orig_index.list_offsets().data_handle(), list_offsets_ptr, IdxT(n_lists), - orig_index.indices.data(), - indices.data(), + orig_index.indices().data_handle(), + ext_index.indices().data_handle(), IdxT(1), stream); } // Copy the old sizes, so we can start from the current state of the index; // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter. - raft::copy(list_sizes_ptr, orig_index.list_sizes.data(), list_sizes.size(), stream); + raft::copy( + list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); const dim3 block_dim(256); const dim3 grid_dim(raft::ceildiv(n_rows, block_dim.x)); @@ -196,110 +199,75 @@ inline auto extend(const handle_t& handle, list_offsets_ptr, new_vectors, new_indices, - data.data(), - indices.data(), + ext_index.data().data_handle(), + ext_index.indices().data_handle(), list_sizes_ptr, n_rows, dim, - orig_index.veclen); + ext_index.veclen()); RAFT_CUDA_TRY(cudaPeekAtLastError()); // Precompute the centers vector norms for L2Expanded distance - auto compute_norms = [&]() { - auto&& r = rmm::device_uvector(n_lists, stream); - utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream); - RAFT_LOG_TRACE_VEC(r.data(), 20); - return std::move(r); - }; - auto&& center_norms = orig_index.metric == raft::distance::DistanceType::L2Expanded - ? 
std::optional(compute_norms()) - : std::nullopt; + if (ext_index.center_norms().has_value()) { + // todo(lsugy): use other prim and remove this one + utils::dots_along_rows(n_lists, + dim, + ext_index.centers().data_handle(), + ext_index.center_norms()->data_handle(), + stream); + RAFT_LOG_TRACE_VEC(ext_index.center_norms()->data_handle(), std::min(dim, 20)); + } // assemble the index - index new_index{{}, - orig_index.veclen, - orig_index.metric, - index_size, - orig_index.dim, - orig_index.n_lists, - std::move(data), - std::move(indices), - std::move(list_sizes), - std::move(list_offsets), - std::move(centers), - std::move(center_norms)}; - - // check index invariants - new_index.check_consistency(); - - return new_index; + return ext_index; } /** See raft::spatial::knn::ivf_flat::build docs */ template -inline auto build(const handle_t& handle, - const index_params& params, - const T* dataset, - IdxT n_rows, - uint32_t dim, - rmm::cuda_stream_view stream) -> index +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index { + auto stream = handle.get_stream(); common::nvtx::range fun_scope( "ivf_flat::build(%zu, %u)", size_t(n_rows), dim); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "unsupported data type"); RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); - // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a - // template parameter (https://github.com/rapidsai/raft/issues/711) - uint32_t veclen = 16 / sizeof(T); - while (dim % veclen != 0) { - veclen = veclen >> 1; + index index(handle, params, dim); + utils::memzero(index.list_sizes().data_handle(), index.list_sizes().size(), stream); + utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); + + // Train the kmeans clustering + { + auto trainset_ratio = std::max( + 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); + auto n_rows_train = n_rows / trainset_ratio; + rmm::device_uvector trainset(n_rows_train * index.dim(), stream); + // TODO: a proper sampling + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), + sizeof(T) * index.dim(), + dataset, + sizeof(T) * index.dim() * trainset_ratio, + sizeof(T) * index.dim(), + n_rows_train, + cudaMemcpyDefault, + stream)); + kmeans::build_hierarchical(handle, + params.kmeans_n_iters, + index.dim(), + trainset.data(), + n_rows_train, + index.centers().data_handle(), + index.n_lists(), + index.metric(), + stream); } - auto n_lists = static_cast(params.n_lists); - - // kmeans cluster ids for the dataset - auto&& centers = rmm::device_uvector(size_t(n_lists) * size_t(dim), stream); - - // Predict labels of the whole dataset - kmeans::build_optimized_kmeans(handle, - params.kmeans_n_iters, - dim, - dataset, - n_rows, - centers.data(), - n_lists, - params.kmeans_trainset_fraction, - params.metric, - stream); - - auto&& data = rmm::device_uvector(0, stream); - auto&& indices = rmm::device_uvector(0, stream); - auto&& list_sizes = rmm::device_uvector(n_lists, stream); - auto&& list_offsets = rmm::device_uvector(n_lists + 1, stream); - utils::memzero(list_sizes.data(), list_sizes.size(), stream); - utils::memzero(list_offsets.data(), list_offsets.size(), stream); - - // assemble the index - index index{{}, - veclen, - params.metric, - IdxT(0), - dim, - n_lists, - std::move(data), - std::move(indices), - std::move(list_sizes), - std::move(list_offsets), - std::move(centers), - std::nullopt}; - - // check 
index invariants - index.check_consistency(); // add the data if necessary if (params.add_data_on_build) { - return extend(handle, index, dataset, nullptr, n_rows, stream); + return detail::extend(handle, index, dataset, nullptr, n_rows); } else { return index; } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index a52fbc69de..5b3b2129f7 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -18,25 +18,23 @@ #include "../ivf_flat_types.hpp" #include "ann_utils.cuh" -#include "topk/radix_topk.cuh" +#include "topk.cuh" #include "topk/warpsort_topk.cuh" -#include #include #include #include #include -#include #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include -#include - namespace raft::spatial::knn::ivf_flat::detail { using namespace raft::spatial::knn::detail; // NOLINT @@ -698,8 +696,8 @@ __global__ void __launch_bounds__(kThreadsPerBlock) copy_vectorized(query_shared, query, std::min(dim, query_smem_elems)); __syncthreads(); - topk::block_sort queue( - k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T)); + using block_sort_t = topk::block_sort; + block_sort_t queue(k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T)); { using align_warp = Pow2; @@ -768,8 +766,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock) } // Enqueue one element per thread - constexpr float kDummy = Ascending ? upper_bound() : lower_bound(); - const float val = valid ? static_cast(dist) : kDummy; + const float val = valid ? static_cast(dist) : block_sort_t::queue_t::kDummy; const size_t idx = valid ? static_cast(list_indices[list_offset + vec_id]) : 0; queue.add(val, idx); } @@ -819,16 +816,16 @@ void launch_kernel(Lambda lambda, uint32_t& grid_dim_x, rmm::cuda_stream_view stream) { - RAFT_EXPECTS(Veclen == index.veclen, + RAFT_EXPECTS(Veclen == index.veclen(), "Configured Veclen does not match the index interleaving pattern."); constexpr auto kKernel = interleaved_scan_kernel; const int max_query_smem = 16384; int query_smem_elems = - std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim)); + std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); int smem_size = query_smem_elems * sizeof(T); constexpr int kSubwarpSize = std::min(Capacity, WarpSize); - smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide( + smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide( kThreadsPerBlock / kSubwarpSize, k); // power-of-two less than cuda limit (for better addr alignment) @@ -855,16 +852,16 @@ void launch_kernel(Lambda lambda, query_smem_elems, queries, coarse_index, - index.indices.data(), - index.data.data(), - index.list_sizes.data(), - index.list_offsets.data(), + index.indices().data_handle(), + index.data().data_handle(), + index.list_sizes().data_handle(), + index.list_offsets().data_handle(), n_probes, k, - index.dim, + index.dim(), neighbors, distances); - queries += grid_dim_y * index.dim; + queries += grid_dim_y * index.dim(); neighbors += grid_dim_y * grid_dim_x * k; distances += grid_dim_y * grid_dim_x * k; } @@ -1041,7 +1038,7 @@ void ivfflat_interleaved_scan(const ivf_flat::index& index, { const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k); select_interleaved_scan_kernel::run(capacity, - index.veclen, + index.veclen(), select_min, metric, index, @@ -1066,25 +1063,25 @@ void 
search_impl(const handle_t& handle, bool select_min, IdxT* neighbors, AccT* distances, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* search_mr) { + auto stream = handle.get_stream(); // The norm of query rmm::device_uvector query_norm_dev(n_queries, stream, search_mr); // The distance value of cluster(list) and queries - rmm::device_uvector distance_buffer_dev(n_queries * index.n_lists, stream, search_mr); + rmm::device_uvector distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr); // The topk distance value of cluster(list) and queries rmm::device_uvector coarse_distances_dev(n_queries * n_probes, stream, search_mr); // The topk index of cluster(list) and queries rmm::device_uvector coarse_indices_dev(n_queries * n_probes, stream, search_mr); - // The topk distance value of candicate vectors from each cluster(list) + // The topk distance value of candidate vectors from each cluster(list) rmm::device_uvector refined_distances_dev(n_queries * n_probes * k, stream, search_mr); - // The topk index of candicate vectors from each cluster(list) + // The topk index of candidate vectors from each cluster(list) rmm::device_uvector refined_indices_dev(n_queries * n_probes * k, stream, search_mr); size_t float_query_size; if constexpr (std::is_integral_v) { - float_query_size = n_queries * index.dim; + float_query_size = n_queries * index.dim(); } else { float_query_size = 0; } @@ -1095,25 +1092,26 @@ void search_impl(const handle_t& handle, converted_queries_ptr = const_cast(queries); } else { linalg::unaryOp( - converted_queries_ptr, queries, n_queries * index.dim, utils::mapping{}, stream); + converted_queries_ptr, queries, n_queries * index.dim(), utils::mapping{}, stream); } float alpha = 1.0f; float beta = 0.0f; - if (index.metric == raft::distance::DistanceType::L2Expanded) { + // todo(lsugy): raft distance? 
(if performance is similar/better than gemm) + if (index.metric() == raft::distance::DistanceType::L2Expanded) { alpha = -2.0f; beta = 1.0f; utils::dots_along_rows( - n_queries, index.dim, converted_queries_ptr, query_norm_dev.data(), stream); + n_queries, index.dim(), converted_queries_ptr, query_norm_dev.data(), stream); utils::outer_add(query_norm_dev.data(), - n_queries, - index.center_norms->data(), - index.n_lists, + (IdxT)n_queries, + index.center_norms()->data_handle(), + (IdxT)index.n_lists(), distance_buffer_dev.data(), stream); - RAFT_LOG_TRACE_VEC(index.center_norms->data(), 20); - RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20); + RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min(20, index.dim())); + RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); } else { alpha = 1.0f; beta = 0.0f; @@ -1122,45 +1120,32 @@ void search_impl(const handle_t& handle, linalg::gemm(handle, true, false, - index.n_lists, + index.n_lists(), n_queries, - index.dim, + index.dim(), &alpha, - index.centers.data(), - index.dim, + index.centers().data_handle(), + index.dim(), converted_queries_ptr, - index.dim, + index.dim(), &beta, distance_buffer_dev.data(), - index.n_lists, + index.n_lists(), stream); - RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20); - if (n_probes <= raft::spatial::knn::detail::topk::kMaxCapacity) { - topk::warp_sort_topk(distance_buffer_dev.data(), - nullptr, - n_queries, - index.n_lists, - n_probes, - coarse_distances_dev.data(), - coarse_indices_dev.data(), - select_min, - stream, - search_mr); - } else { - topk::radix_topk(distance_buffer_dev.data(), - nullptr, - n_queries, - index.n_lists, - n_probes, - coarse_distances_dev.data(), - coarse_indices_dev.data(), - select_min, - stream, - search_mr); - } - RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes); - RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes); + RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); + select_topk(distance_buffer_dev.data(), + nullptr, + n_queries, + index.n_lists(), + n_probes, + coarse_distances_dev.data(), + coarse_indices_dev.data(), + select_min, + stream, + search_mr); + RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes); + RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes); auto distances_dev_ptr = refined_distances_dev.data(); auto indices_dev_ptr = refined_indices_dev.data(); @@ -1172,7 +1157,7 @@ void search_impl(const handle_t& handle, nullptr, nullptr, n_queries, - index.metric, + index.metric(), n_probes, k, select_min, @@ -1193,7 +1178,7 @@ void search_impl(const handle_t& handle, queries, coarse_indices_dev.data(), n_queries, - index.metric, + index.metric(), n_probes, k, select_min, @@ -1207,31 +1192,16 @@ void search_impl(const handle_t& handle, // Merge topk values from different blocks if (grid_dim_x > 1) { - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) { - topk::warp_sort_topk(refined_distances_dev.data(), - refined_indices_dev.data(), - n_queries, - k * grid_dim_x, - k, - distances, - neighbors, - select_min, - stream, - search_mr); - } else { - // NB: this branch can only be triggered once `ivfflat_interleaved_scan` above supports larger - // `k` values (kMaxCapacity limit as a dependency of topk::block_sort) - topk::radix_topk(refined_distances_dev.data(), - refined_indices_dev.data(), - n_queries, - k * grid_dim_x, - k, - distances, - neighbors, - select_min, - stream, - search_mr); - } + select_topk(refined_distances_dev.data(), + refined_indices_dev.data(), + 
n_queries, + k * grid_dim_x, + k, + distances, + neighbors, + select_min, + stream, + search_mr); } } @@ -1245,23 +1215,22 @@ inline void search(const handle_t& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = nullptr) { common::nvtx::range fun_scope( - "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim); + "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); RAFT_EXPECTS(params.n_probes > 0, "n_probes (number of clusters to probe in the search) must be positive."); - auto n_probes = std::min(params.n_probes, index.n_lists); + auto n_probes = std::min(params.n_probes, index.n_lists()); bool select_min; - switch (index.metric) { + switch (index.metric()) { case raft::distance::DistanceType::InnerProduct: case raft::distance::DistanceType::CosineExpanded: case raft::distance::DistanceType::CorrelationExpanded: // Similarity metrics have the opposite meaning, i.e. nearest neigbours are those with larger - // similarity (See the same logic at cpp/include/raft/sparse/selection/detail/knn.cuh:362 + // similarity (See the same logic at cpp/include/raft/sparse/spatial/detail/knn.cuh:362 // {perform_k_selection}) select_min = false; break; @@ -1275,7 +1244,7 @@ inline void search(const handle_t& handle, } return search_impl( - handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, stream, mr); + handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, mr); } } // namespace raft::spatial::knn::ivf_flat::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh new file mode 100644 index 0000000000..f13dcd8cc6 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -0,0 +1,1076 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ivf_pq_types.hpp" +#include "ann_kmeans_balanced.cuh" +#include "ann_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace raft::spatial::knn::ivf_pq::detail { + +using namespace raft::spatial::knn::detail; // NOLINT + +namespace { + +/** + * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`. + * + * @tparam Bits number of bits comprising the value. 
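+ * (a code whose bit offset plus Bits crosses a byte boundary is read and
+ * written by combining two adjacent bytes, as handled below)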
+ */ +template +struct bitfield_ref_t { + static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); + constexpr static uint8_t kMask = static_cast((1u << Bits) - 1u); + uint8_t* ptr; + uint32_t offset; + + constexpr operator uint8_t() // NOLINT + { + auto pair = static_cast(ptr[0]); + if (offset + Bits > 8) { pair |= static_cast(ptr[1]) << 8; } + return static_cast((pair >> offset) & kMask); + } + + constexpr auto operator=(uint8_t code) -> bitfield_ref_t& + { + if (offset + Bits > 8) { + auto pair = static_cast(ptr[0]); + pair |= static_cast(ptr[1]) << 8; + pair &= ~(static_cast(kMask) << offset); + pair |= static_cast(code) << offset; + ptr[0] = static_cast(Pow2<256>::mod(pair)); + ptr[1] = static_cast(Pow2<256>::div(pair)); + } else { + ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset); + } + return *this; + } +}; + +/** + * View a byte array as an array of unsigned integers of custom small bit size. + * + * @tparam Bits number of bits comprising a single element of the array. + */ +template +struct bitfield_view_t { + static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); + uint8_t* raw; + + constexpr auto operator[](uint32_t i) -> bitfield_ref_t + { + uint32_t bit_offset = i * Bits; + return bitfield_ref_t{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)}; + } +}; + +/* + NB: label type is uint32_t although it can only contain values up to `1 << pq_bits`. + We keep it this way to not force one more overload for kmeans::predict. + */ +template +HDI void ivfpq_encode_core(uint32_t n_rows, uint32_t pq_dim, const uint32_t* label, uint8_t* output) +{ + bitfield_view_t out{output}; + for (uint32_t j = 0; j < pq_dim; j++, label += n_rows) { + out[j] = static_cast(*label); + } +} + +template +__launch_bounds__(BlockDim) __global__ + void ivfpq_encode_kernel(uint32_t n_rows, + uint32_t pq_dim, + const uint32_t* label, // [pq_dim, n_rows] + uint8_t* output // [n_rows, pq_dim] + ) +{ + uint32_t i = threadIdx.x + BlockDim * blockIdx.x; + if (i >= n_rows) return; + ivfpq_encode_core(n_rows, pq_dim, label + i, output + (pq_dim * PqBits / 8) * i); +} +} // namespace + +inline void ivfpq_encode(uint32_t n_rows, + uint32_t pq_dim, + uint32_t pq_bits, // 4 <= pq_bits <= 8 + const uint32_t* label, // [pq_dim, n_rows] + uint8_t* output, // [n_rows, pq_dim] + rmm::cuda_stream_view stream) +{ + constexpr uint32_t kBlockDim = 128; + dim3 threads(kBlockDim, 1, 1); + dim3 blocks(raft::ceildiv(n_rows, kBlockDim), 1, 1); + switch (pq_bits) { + case 4: + return ivfpq_encode_kernel + <<>>(n_rows, pq_dim, label, output); + case 5: + return ivfpq_encode_kernel + <<>>(n_rows, pq_dim, label, output); + case 6: + return ivfpq_encode_kernel + <<>>(n_rows, pq_dim, label, output); + case 7: + return ivfpq_encode_kernel + <<>>(n_rows, pq_dim, label, output); + case 8: + return ivfpq_encode_kernel + <<>>(n_rows, pq_dim, label, output); + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } +} + +/** + * @brief Fill-in a random orthogonal transformation matrix. + * + * @param handle + * @param force_random_rotation + * @param n_rows + * @param n_cols + * @param[out] rotation_matrix device pointer to a row-major matrix of size [n_rows, n_cols]. 
+ * @param rng random number generator state + */ +inline void make_rotation_matrix(const handle_t& handle, + bool force_random_rotation, + uint32_t n_rows, + uint32_t n_cols, + float* rotation_matrix, + raft::random::Rng rng = raft::random::Rng(7ULL)) +{ + common::nvtx::range fun_scope( + "ivf_pq::make_rotation_matrix(%u * %u)", n_rows, n_cols); + auto stream = handle.get_stream(); + bool inplace = n_rows == n_cols; + uint32_t n = std::max(n_rows, n_cols); + if (force_random_rotation || !inplace) { + rmm::device_uvector buf(inplace ? 0 : n * n, stream); + float* mat = inplace ? rotation_matrix : buf.data(); + rng.normal(mat, n * n, 0.0f, 1.0f, stream); + linalg::detail::qrGetQ_inplace(handle, mat, n, n, stream); + if (!inplace) { + RAFT_CUDA_TRY(cudaMemcpy2DAsync(rotation_matrix, + sizeof(float) * n_cols, + mat, + sizeof(float) * n, + sizeof(float) * n_cols, + n_rows, + cudaMemcpyDefault, + stream)); + } + } else { + uint32_t stride = n + 1; + auto f = [stride] __device__(float* out, uint32_t i) -> void { *out = float(i % stride == 0); }; + linalg::writeOnlyUnaryOp(rotation_matrix, n * n, f, stream); + } +} + +/** + * @brief Compute residual vectors from the source dataset given by selected indices. + * + * The residual has the form `rotation_matrix %* (dataset[row_ids, :] - center)` + * + */ +template +void select_residuals(const handle_t& handle, + float* residuals, + IdxT n_rows, + uint32_t dim, + uint32_t rot_dim, + const float* rotation_matrix, // [rot_dim, dim] + const float* center, // [dim] + const T* dataset, // [.., dim] + const IdxT* row_ids, // [n_rows] + rmm::mr::device_memory_resource* device_memory + +) +{ + auto stream = handle.get_stream(); + rmm::device_uvector tmp(n_rows * dim, stream, device_memory); + utils::copy_selected( + n_rows, (IdxT)dim, dataset, row_ids, (IdxT)dim, tmp.data(), (IdxT)dim, stream); + + raft::matrix::linewiseOp( + tmp.data(), + tmp.data(), + IdxT(dim), + n_rows, + true, + [] __device__(float a, float b) { return a - b; }, + stream, + center); + + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + rot_dim, + n_rows, + dim, + &alpha, + rotation_matrix, + dim, + tmp.data(), + dim, + &beta, + residuals, + rot_dim, + stream); +} + +/** + * @param handle, + * @param n_rows + * @param data_dim + * @param rot_dim + * @param pq_dim + * @param pq_len + * @param pq_bits + * @param n_clusters + * @param codebook_kind + * @param max_cluster_size + * @param cluster_centers // [n_clusters, data_dim] + * @param rotation_matrix // [rot_dim, data_dim] + * @param dataset // [n_rows] + * @param data_indices + * tells which indices to select in the dataset for each cluster [n_rows]; + * it should be partitioned by the clusters by now. + * @param cluster_sizes // [n_clusters] + * @param cluster_offsets // [n_clusters + 1] + * @param pq_centers // [...] 
+ * @param pq_dataset // [n_rows, pq_dim * pq_bits / 8] + * @param device_memory + */ +template +void compute_pq_codes(const handle_t& handle, + IdxT n_rows, + uint32_t data_dim, + uint32_t rot_dim, + uint32_t pq_dim, + uint32_t pq_len, + uint32_t pq_bits, + uint32_t n_clusters, + codebook_gen codebook_kind, + uint32_t max_cluster_size, + float* cluster_centers, + const float* rotation_matrix, + const T* dataset, + const IdxT* data_indices, + const uint32_t* cluster_sizes, + const IdxT* cluster_offsets, + const float* pq_centers, + uint8_t* pq_dataset, + rmm::mr::device_memory_resource* device_memory) +{ + common::nvtx::range fun_scope( + "ivf_pq::compute_pq_codes(n_rows = %zu, data_dim = %u, rot_dim = %u (%u * %u), n_clusters = " + "%u)", + size_t(n_rows), + data_dim, + rot_dim, + pq_dim, + pq_len, + n_clusters); + auto stream = handle.get_stream(); + + // + // Compute PQ code + // + utils::memzero(pq_dataset, n_rows * pq_dim * pq_bits / 8, stream); + + rmm::device_uvector rot_vectors(max_cluster_size * rot_dim, stream, device_memory); + rmm::device_uvector sub_vectors(max_cluster_size * pq_dim * pq_len, stream, device_memory); + rmm::device_uvector sub_vector_labels(max_cluster_size * pq_dim, stream, device_memory); + rmm::device_uvector my_pq_dataset( + max_cluster_size * pq_dim * pq_bits / 8 /* NB: pq_dim * bitPQ % 8 == 0 */, + stream, + device_memory); + + for (uint32_t l = 0; l < n_clusters; l++) { + auto cluster_size = cluster_sizes[l]; + common::nvtx::range cluster_scope( + "ivf_pq::compute_pq_codes::cluster[%u](size = %u)", l, cluster_size); + if (cluster_size == 0) continue; + + select_residuals(handle, + rot_vectors.data(), + IdxT(cluster_size), + data_dim, + rot_dim, + rotation_matrix, + cluster_centers + uint64_t(l) * data_dim, + dataset, + data_indices + cluster_offsets[l], + device_memory); + + // + // Change the order of the vector data to facilitate processing in + // each vector subspace. + // input: rot_vectors[cluster_size, rot_dim] = [cluster_size, pq_dim, pq_len] + // output: sub_vectors[pq_dim, cluster_size, pq_len] + // + for (uint32_t i = 0; i < pq_dim; i++) { + RAFT_CUDA_TRY(cudaMemcpy2DAsync(sub_vectors.data() + i * pq_len * cluster_size, + sizeof(float) * pq_len, + rot_vectors.data() + i * pq_len, + sizeof(float) * rot_dim, + sizeof(float) * pq_len, + cluster_size, + cudaMemcpyDefault, + stream)); + } + + // + // Find a label (cluster ID) for each vector subspace. 
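
To make each subspace contiguous for the per-subspace `kmeans::predict` calls, the `cudaMemcpy2DAsync` loop above regroups the row-major residuals `[cluster_size, pq_dim * pq_len]` into one `[cluster_size, pq_len]` slab per subspace. A plain-loop host equivalent (illustrative only) of what each of those pitched copies moves:

```c++
#include <cstddef>
#include <vector>

// Regroup rot_vectors[n][pq_dim * pq_len] (row-major) into
// sub_vectors[pq_dim][n][pq_len]: one contiguous slab per subspace j.
void regroup_subspaces(const std::vector<float>& rot_vectors,
                       std::vector<float>& sub_vectors,
                       size_t n, size_t pq_dim, size_t pq_len)
{
  for (size_t j = 0; j < pq_dim; j++)      // one cudaMemcpy2DAsync per subspace j
    for (size_t row = 0; row < n; row++)   // the 'height' of the 2D copy
      for (size_t k = 0; k < pq_len; k++)  // the 'width' = pq_len floats
        sub_vectors[(j * n + row) * pq_len + k] =
          rot_vectors[row * pq_dim * pq_len + j * pq_len + k];
}
```
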
+ // + for (uint32_t j = 0; j < pq_dim; j++) { + const float* sub_pq_centers = nullptr; + switch (codebook_kind) { + case codebook_gen::PER_SUBSPACE: + sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * j; + break; + case codebook_gen::PER_CLUSTER: + sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * l; + break; + default: RAFT_FAIL("Unreachable code"); + } + kmeans::predict(handle, + sub_pq_centers, + (1 << pq_bits), + pq_len, + sub_vectors.data() + j * (cluster_size * pq_len), + cluster_size, + sub_vector_labels.data() + j * cluster_size, + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); + } + + // + // PQ encoding + // + ivfpq_encode( + cluster_size, pq_dim, pq_bits, sub_vector_labels.data(), my_pq_dataset.data(), stream); + copy(pq_dataset + cluster_offsets[l] * uint64_t{pq_dim * pq_bits / 8}, + my_pq_dataset.data(), + cluster_size * pq_dim * pq_bits / 8, + stream); + } +} + +template +__launch_bounds__(BlockDim) __global__ void fill_indices_kernel(IdxT n_rows, + IdxT* data_indices, + IdxT* data_offsets, + const uint32_t* labels) +{ + const auto i = BlockDim * IdxT(blockIdx.x) + IdxT(threadIdx.x); + if (i >= n_rows) { return; } + data_indices[atomicAdd(data_offsets + labels[i], 1)] = i; +} + +/** + * @brief Calculate cluster offsets and arrange data indices into clusters. + * + * @param n_rows + * @param n_lists + * @param[in] labels output of k-means prediction [n_rows] + * @param[in] cluster_sizes [n_lists] + * @param[out] cluster_offsets [n_lists+1] + * @param[out] data_indices [n_rows] + * + * @return size of the largest cluster + */ +template +auto calculate_offsets_and_indices(IdxT n_rows, + uint32_t n_lists, + const uint32_t* labels, + const uint32_t* cluster_sizes, + IdxT* cluster_offsets, + IdxT* data_indices, + rmm::cuda_stream_view stream) -> uint32_t +{ + auto exec_policy = rmm::exec_policy(stream); + uint32_t max_cluster_size = 0; + rmm::device_scalar max_cluster_size_dev_buf(stream); + auto max_cluster_size_dev = max_cluster_size_dev_buf.data(); + update_device(max_cluster_size_dev, &max_cluster_size, 1, stream); + // Calculate the offsets + IdxT cumsum = 0; + update_device(cluster_offsets, &cumsum, 1, stream); + thrust::inclusive_scan(exec_policy, + cluster_sizes, + cluster_sizes + n_lists, + cluster_offsets + 1, + [max_cluster_size_dev] __device__(IdxT s, uint32_t l) { + atomicMax(max_cluster_size_dev, l); + return s + l; + }); + update_host(&cumsum, cluster_offsets + n_lists, 1, stream); + update_host(&max_cluster_size, max_cluster_size_dev, 1, stream); + stream.synchronize(); + RAFT_EXPECTS(cumsum == n_rows, "cluster sizes do not add up."); + rmm::device_uvector data_offsets_buf(n_lists, stream); + auto data_offsets = data_offsets_buf.data(); + copy(data_offsets, cluster_offsets, n_lists, stream); + constexpr uint32_t n_threads = 128; // NOLINT + const IdxT n_blocks = raft::div_rounding_up_unsafe(n_rows, n_threads); + fill_indices_kernel + <<>>(n_rows, data_indices, data_offsets, labels); + return max_cluster_size; +} + +template +void train_per_subset(const handle_t& handle, + index& index, + IdxT n_rows, + const float* trainset, // [n_rows, dim] + const uint32_t* labels, // [n_rows] + uint32_t kmeans_n_iters, + rmm::mr::device_memory_resource* managed_memory, + rmm::mr::device_memory_resource* device_memory) +{ + auto stream = handle.get_stream(); + + rmm::device_uvector sub_trainset(n_rows * index.pq_len(), stream, device_memory); + rmm::device_uvector sub_labels(n_rows, stream, device_memory); + + rmm::device_uvector 
pq_cluster_sizes(index.pq_book_size(), stream, device_memory); + + for (uint32_t j = 0; j < index.pq_dim(); j++) { + common::nvtx::range pq_per_subspace_scope( + "ivf_pq::build::per_subspace[%u]", j); + + // Get the rotated cluster centers for each training vector. + // This will be subtracted from the input vectors afterwards. + utils::copy_selected(n_rows, + (IdxT)index.pq_len(), + index.centers_rot().data_handle() + index.pq_len() * j, + labels, + (IdxT)index.rot_dim(), + sub_trainset.data(), + (IdxT)index.pq_len(), + stream); + + // sub_trainset is the slice of: rotate(trainset) - centers_rot + float alpha = 1.0; + float beta = -1.0; + linalg::gemm(handle, + true, + false, + index.pq_len(), + n_rows, + index.dim(), + &alpha, + index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, + index.dim(), + trainset, + index.dim(), + &beta, + sub_trainset.data(), + index.pq_len(), + stream); + + // train PQ codebook for this subspace + kmeans::build_clusters( + handle, + kmeans_n_iters, + index.pq_len(), + sub_trainset.data(), + n_rows, + index.pq_book_size(), + index.pq_centers().data_handle() + (index.pq_book_size() * index.pq_len()) * j, + sub_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); + } +} + +template +void train_per_cluster(const handle_t& handle, + index& index, + IdxT n_rows, + const float* trainset, // [n_rows, dim] + const uint32_t* labels, // [n_rows] + uint32_t kmeans_n_iters, + rmm::mr::device_memory_resource* managed_memory, + rmm::mr::device_memory_resource* device_memory) +{ + auto stream = handle.get_stream(); + rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); + rmm::device_uvector indices_buf(n_rows, stream, device_memory); + rmm::device_uvector offsets_buf(index.list_offsets().size(), stream, managed_memory); + + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(cluster_sizes.data()), + IdxT(index.n_lists()), + labels, + n_rows, + 1, + stream); + + auto cluster_offsets = offsets_buf.data(); + auto indices = indices_buf.data(); + uint32_t max_cluster_size = calculate_offsets_and_indices( + n_rows, index.n_lists(), labels, cluster_sizes.data(), cluster_offsets, indices, stream); + + rmm::device_uvector pq_labels(max_cluster_size * index.pq_dim(), stream, device_memory); + rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); + rmm::device_uvector rot_vectors(max_cluster_size * index.rot_dim(), stream, device_memory); + + handle.sync_stream(); // make sure cluster offsets are up-to-date + for (uint32_t l = 0; l < index.n_lists(); l++) { + auto cluster_size = cluster_sizes.data()[l]; + if (cluster_size == 0) continue; + common::nvtx::range pq_per_cluster_scope( + "ivf_pq::build::per_cluster[%u](size = %u)", l, cluster_size); + + select_residuals(handle, + rot_vectors.data(), + IdxT(cluster_size), + index.dim(), + index.rot_dim(), + index.rotation_matrix().data_handle(), + index.centers().data_handle() + uint64_t(l) * index.dim_ext(), + trainset, + indices + cluster_offsets[l], + device_memory); + + // limit the cluster size to bound the training time. 
+ // [sic] we interpret the data as pq_len-dimensional + size_t big_enough = 256 * std::max(index.pq_book_size(), index.pq_dim()); + size_t available_rows = cluster_size * index.pq_dim(); + auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); + // train PQ codebook for this cluster + kmeans::build_clusters( + handle, + kmeans_n_iters, + index.pq_len(), + rot_vectors.data(), + pq_n_rows, + index.pq_book_size(), + index.pq_centers().data_handle() + index.pq_book_size() * index.pq_len() * l, + pq_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); + } +} + +/** See raft::spatial::knn::ivf_pq::extend docs */ +template +inline auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index +{ + common::nvtx::range fun_scope( + "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim()); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0, + "You must pass data indices when the index is non-empty."); + + static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + "Unsupported data type"); + + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("ivf_pq::extend: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::mr::managed_memory_resource managed_memory_upstream; + rmm::mr::pool_memory_resource managed_memory( + &managed_memory_upstream, 1024 * 1024); + + // + // The cluster_centers stored in index contain data other than cluster + // centroids to speed up the search. Here, only the cluster centroids + // are extracted. + // + const auto n_clusters = orig_index.n_lists(); + + rmm::device_uvector cluster_centers(n_clusters * orig_index.dim(), stream, device_memory); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), + sizeof(float) * orig_index.dim(), + orig_index.centers().data_handle(), + sizeof(float) * orig_index.dim_ext(), + sizeof(float) * orig_index.dim(), + n_clusters, + cudaMemcpyDefault, + stream)); + + // + // Use the existing cluster centroids to find the label (cluster ID) + // of the vector to be added. 
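
Because `index.centers()` stores each center padded to `dim_ext` floats (the extra slots carry data such as the squared center norm, appended in `build()` below to speed up the search), `extend()` first strips that padding with a pitched 2D copy. A host analogue of the copy, assuming the same `[n_clusters, dim_ext]` row-major layout (names illustrative):

```c++
#include <cstddef>
#include <vector>

// Take the first `dim` floats of every `dim_ext`-wide center row.
std::vector<float> extract_centroids(const std::vector<float>& centers_ext,
                                     size_t n_clusters, size_t dim, size_t dim_ext)
{
  std::vector<float> centers(n_clusters * dim);
  for (size_t l = 0; l < n_clusters; l++)  // the 'height' of the 2D copy
    for (size_t c = 0; c < dim; c++)       // the 'width' = dim * sizeof(float)
      centers[l * dim + c] = centers_ext[l * dim_ext + c];  // src pitch = dim_ext
  return centers;
}
```
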
+ // + + rmm::device_uvector new_data_labels(n_rows, stream, device_memory); + utils::memzero(new_data_labels.data(), n_rows, stream); + rmm::device_uvector new_cluster_sizes_buf(n_clusters, stream, &managed_memory); + auto new_cluster_sizes = new_cluster_sizes_buf.data(); + utils::memzero(new_cluster_sizes, n_clusters, stream); + + kmeans::predict(handle, + cluster_centers.data(), + n_clusters, + orig_index.dim(), + new_vectors, + n_rows, + new_data_labels.data(), + orig_index.metric(), + stream); + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(new_cluster_sizes), + IdxT(n_clusters), + new_data_labels.data(), + n_rows, + 1, + stream); + + // + // Make new_cluster_offsets, new_data_indices + // + rmm::device_uvector new_data_indices(n_rows, stream, &managed_memory); + rmm::device_uvector new_cluster_offsets(n_clusters + 1, stream, &managed_memory); + uint32_t new_max_cluster_size = calculate_offsets_and_indices(n_rows, + n_clusters, + new_data_labels.data(), + new_cluster_sizes, + new_cluster_offsets.data(), + new_data_indices.data(), + stream); + + // + // Compute PQ code for new vectors + // + rmm::device_uvector new_pq_codes( + n_rows * orig_index.pq_dim() * orig_index.pq_bits() / 8, stream, device_memory); + compute_pq_codes(handle, + n_rows, + orig_index.dim(), + orig_index.rot_dim(), + orig_index.pq_dim(), + orig_index.pq_len(), + orig_index.pq_bits(), + n_clusters, + orig_index.codebook_kind(), + new_max_cluster_size, + cluster_centers.data(), + orig_index.rotation_matrix().data_handle(), + new_vectors, + new_data_indices.data(), + new_cluster_sizes, + new_cluster_offsets.data(), + orig_index.pq_centers().data_handle(), + new_pq_codes.data(), + device_memory); + + // Get the combined cluster sizes and sort the clusters in decreasing order + // (this makes it easy to estimate the max number of samples during search). 
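
The block that follows combines the old and new cluster sizes, sorts the clusters by size in decreasing order with `cub::DeviceRadixSort::SortPairsDescending`, and then counts the non-empty clusters with a `lower_bound` under the `greater` ordering. A self-contained host mirror of that logic (illustrative, STL in place of cub/thrust):

```c++
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
  std::vector<uint32_t> sizes = {5, 0, 42, 0, 7};  // combined cluster sizes
  std::vector<uint32_t> order(sizes.size());
  std::iota(order.begin(), order.end(), 0u);       // like cluster_ordering_in
  std::sort(order.begin(), order.end(),            // sort ids by size, descending
            [&](uint32_t a, uint32_t b) { return sizes[a] > sizes[b]; });
  std::vector<uint32_t> sorted;                    // ext_cluster_sizes after sort
  for (auto k : order) { sorted.push_back(sizes[k]); }
  // First position where the size is not > 0 equals the number of non-empty lists.
  auto n_nonempty = std::lower_bound(sorted.begin(), sorted.end(), 0u,
                                     std::greater<uint32_t>()) - sorted.begin();
  printf("n_nonempty_lists = %zd\n", n_nonempty);  // prints 3
}
```
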
+ rmm::device_uvector old_cluster_sizes_buf(n_clusters, stream, &managed_memory); + rmm::device_uvector ext_cluster_sizes_buf(n_clusters, stream, &managed_memory); + rmm::device_uvector old_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); + rmm::device_uvector ext_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); + rmm::device_uvector cluster_ordering(n_clusters, stream, &managed_memory); + auto old_cluster_sizes = old_cluster_sizes_buf.data(); + auto ext_cluster_sizes = ext_cluster_sizes_buf.data(); + auto old_cluster_offsets = old_cluster_offsets_buf.data(); + auto ext_cluster_offsets = ext_cluster_offsets_buf.data(); + copy(old_cluster_offsets, + orig_index.list_offsets().data_handle(), + orig_index.list_offsets().size(), + stream); + + uint32_t n_nonempty_lists = 0; + { + rmm::device_uvector ext_cluster_sizes_buf_in(n_clusters, stream, device_memory); + rmm::device_uvector cluster_ordering_in(n_clusters, stream, device_memory); + auto ext_cluster_sizes_in = ext_cluster_sizes_buf_in.data(); + linalg::writeOnlyUnaryOp( + old_cluster_sizes, + n_clusters, + [ext_cluster_sizes_in, new_cluster_sizes, old_cluster_offsets] __device__(uint32_t * out, + size_t i) { + auto old_size = old_cluster_offsets[i + 1] - old_cluster_offsets[i]; + ext_cluster_sizes_in[i] = old_size + new_cluster_sizes[i]; + *out = old_size; + }, + stream); + + thrust::sequence(handle.get_thrust_policy(), + cluster_ordering_in.data(), + cluster_ordering_in.data() + n_clusters); + + int begin_bit = 0; + int end_bit = sizeof(uint32_t) * 8; + size_t cub_workspace_size = 0; + cub::DeviceRadixSort::SortPairsDescending(nullptr, + cub_workspace_size, + ext_cluster_sizes_in, + ext_cluster_sizes, + cluster_ordering_in.data(), + cluster_ordering.data(), + n_clusters, + begin_bit, + end_bit, + stream); + rmm::device_buffer cub_workspace(cub_workspace_size, stream, device_memory); + cub::DeviceRadixSort::SortPairsDescending(cub_workspace.data(), + cub_workspace_size, + ext_cluster_sizes_in, + ext_cluster_sizes, + cluster_ordering_in.data(), + cluster_ordering.data(), + n_clusters, + begin_bit, + end_bit, + stream); + + n_nonempty_lists = thrust::lower_bound(handle.get_thrust_policy(), + ext_cluster_sizes, + ext_cluster_sizes + n_clusters, + 0, + thrust::greater()) - + ext_cluster_sizes; + } + + // Assemble the extended index + ivf_pq::index ext_index(handle, + orig_index.metric(), + orig_index.codebook_kind(), + n_clusters, + orig_index.dim(), + orig_index.pq_bits(), + orig_index.pq_dim(), + n_nonempty_lists); + ext_index.allocate(handle, orig_index.size() + n_rows); + + // Copy the unchanged parts + copy(ext_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().size(), + stream); + + // calculate extended cluster offsets + auto ext_indices = ext_index.indices().data_handle(); + { + IdxT zero = 0; + update_device(ext_cluster_offsets, &zero, 1, stream); + thrust::inclusive_scan(handle.get_thrust_policy(), + ext_cluster_sizes, + ext_cluster_sizes + n_clusters, + ext_cluster_offsets + 1, + [] __device__(IdxT s, uint32_t l) { return s + l; }); + copy(ext_index.list_offsets().data_handle(), + ext_cluster_offsets, + ext_index.list_offsets().size(), + stream); + } + + // copy cluster-ordering-dependent data + utils::copy_selected(n_clusters, + ext_index.dim_ext(), + orig_index.centers().data_handle(), + cluster_ordering.data(), + orig_index.dim_ext(), + ext_index.centers().data_handle(), + ext_index.dim_ext(), + stream); + utils::copy_selected(n_clusters, + 
ext_index.rot_dim(), + orig_index.centers_rot().data_handle(), + cluster_ordering.data(), + orig_index.rot_dim(), + ext_index.centers_rot().data_handle(), + ext_index.rot_dim(), + stream); + switch (orig_index.codebook_kind()) { + case codebook_gen::PER_SUBSPACE: { + copy(ext_index.pq_centers().data_handle(), + orig_index.pq_centers().data_handle(), + orig_index.pq_centers().size(), + stream); + } break; + case codebook_gen::PER_CLUSTER: { + auto d = orig_index.pq_book_size() * orig_index.pq_len(); + utils::copy_selected(n_clusters, + d, + orig_index.pq_centers().data_handle(), + cluster_ordering.data(), + d, + ext_index.pq_centers().data_handle(), + d, + stream); + } break; + default: RAFT_FAIL("Unreachable code"); + } + + // Make ext_indices + handle.sync_stream(); // make sure cluster sizes are up-to-date + for (uint32_t l = 0; l < ext_index.n_lists(); l++) { + auto k = cluster_ordering.data()[l]; + auto old_cluster_size = old_cluster_sizes[k]; + auto new_cluster_size = new_cluster_sizes[k]; + if (old_cluster_size > 0) { + copy(ext_indices + ext_cluster_offsets[l], + orig_index.indices().data_handle() + old_cluster_offsets[k], + old_cluster_size, + stream); + } + if (new_cluster_size > 0) { + if (new_indices == nullptr) { + // implies the orig index is empty + copy(ext_indices + ext_cluster_offsets[l] + old_cluster_size, + new_data_indices.data() + new_cluster_offsets.data()[k], + new_cluster_size, + stream); + } else { + utils::copy_selected((IdxT)new_cluster_size, + (IdxT)1, + new_indices, + new_data_indices.data() + new_cluster_offsets.data()[k], + (IdxT)1, + ext_indices + ext_cluster_offsets[l] + old_cluster_size, + (IdxT)1, + stream); + } + } + } + + /* Extend the pq_dataset */ + auto ext_pq_dataset = ext_index.pq_dataset().data_handle(); + size_t pq_dataset_unit = ext_index.pq_dim() * ext_index.pq_bits() / 8; + for (uint32_t l = 0; l < ext_index.n_lists(); l++) { + auto k = cluster_ordering.data()[l]; + auto old_cluster_size = old_cluster_sizes[k]; + copy(ext_pq_dataset + pq_dataset_unit * ext_cluster_offsets[l], + orig_index.pq_dataset().data_handle() + pq_dataset_unit * old_cluster_offsets[k], + pq_dataset_unit * old_cluster_size, + stream); + copy(ext_pq_dataset + pq_dataset_unit * (ext_cluster_offsets[l] + old_cluster_size), + new_pq_codes.data() + pq_dataset_unit * new_cluster_offsets.data()[k], + pq_dataset_unit * new_cluster_sizes[k], + stream); + } + + return ext_index; +} + +/** See raft::spatial::knn::ivf_pq::build docs */ +template +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index +{ + common::nvtx::range fun_scope( + "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); + static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + "Unsupported data type"); + + RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + + auto stream = handle.get_stream(); + + ivf_pq::index index(handle, params, dim); + utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); + + auto trainset_ratio = std::max( + 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); + auto n_rows_train = n_rows / trainset_ratio; + + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); + if (pool_guard) { + RAFT_LOG_DEBUG("ivf_pq::build: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::mr::managed_memory_resource managed_memory_upstream; 
+ rmm::mr::pool_memory_resource managed_memory( + &managed_memory_upstream, 1024 * 1024); + + // Besides just sampling, we transform the input dataset into floats to make it easier + // to use gemm operations from cublas. + rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); + // TODO: a proper sampling + if constexpr (std::is_same_v) { + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), + sizeof(T) * index.dim(), + dataset, + sizeof(T) * index.dim() * trainset_ratio, + sizeof(T) * index.dim(), + n_rows_train, + cudaMemcpyDefault, + stream)); + } else { + auto dim = index.dim(); + linalg::writeOnlyUnaryOp( + trainset.data(), + index.dim() * n_rows_train, + [dataset, trainset_ratio, dim] __device__(float* out, size_t i) { + auto col = i % dim; + *out = utils::mapping{}(dataset[(i - col) * trainset_ratio + col]); + }, + stream); + } + + // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, dim_ext]! + rmm::device_uvector cluster_centers_buf( + index.n_lists() * index.dim(), stream, device_memory); + auto cluster_centers = cluster_centers_buf.data(); + + // Train balanced hierarchical kmeans clustering + kmeans::build_hierarchical(handle, + params.kmeans_n_iters, + index.dim(), + trainset.data(), + n_rows_train, + cluster_centers, + index.n_lists(), + index.metric(), + stream); + + // Trainset labels are needed for training PQ codebooks + rmm::device_uvector labels(n_rows_train, stream, device_memory); + kmeans::predict(handle, + cluster_centers, + index.n_lists(), + index.dim(), + trainset.data(), + n_rows_train, + labels.data(), + index.metric(), + stream, + device_memory); + + { + // combine cluster_centers and their norms + RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle(), + sizeof(float) * index.dim_ext(), + cluster_centers, + sizeof(float) * index.dim(), + sizeof(float) * index.dim(), + index.n_lists(), + cudaMemcpyDefault, + stream)); + + rmm::device_uvector center_norms(index.n_lists(), stream, device_memory); + utils::dots_along_rows( + index.n_lists(), index.dim(), cluster_centers, center_norms.data(), stream); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle() + index.dim(), + sizeof(float) * index.dim_ext(), + center_norms.data(), + sizeof(float), + sizeof(float), + index.n_lists(), + cudaMemcpyDefault, + stream)); + } + + // Make rotation matrix + make_rotation_matrix(handle, + params.force_random_rotation, + index.rot_dim(), + index.dim(), + index.rotation_matrix().data_handle()); + + // Rotate cluster_centers + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + index.rot_dim(), + index.n_lists(), + index.dim(), + &alpha, + index.rotation_matrix().data_handle(), + index.dim(), + cluster_centers, + index.dim(), + &beta, + index.centers_rot().data_handle(), + index.rot_dim(), + stream); + + // Train PQ codebooks + switch (index.codebook_kind()) { + case codebook_gen::PER_SUBSPACE: + train_per_subset(handle, + index, + n_rows_train, + trainset.data(), + labels.data(), + params.kmeans_n_iters, + &managed_memory, + device_memory); + break; + case codebook_gen::PER_CLUSTER: + train_per_cluster(handle, + index, + n_rows_train, + trainset.data(), + labels.data(), + params.kmeans_n_iters, + &managed_memory, + device_memory); + break; + default: RAFT_FAIL("Unreachable code"); + } + + // add the data if necessary + if (params.add_data_on_build) { + return detail::extend(handle, index, dataset, nullptr, n_rows); + } else { + return index; + } +} + +} // namespace 
raft::spatial::knn::ivf_pq::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh
new file mode 100644
index 0000000000..b1f47a6c52
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../ivf_pq_types.hpp"
+#include "ann_utils.cuh"
+#include "topk.cuh"
+#include "topk/warpsort_topk.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace raft::spatial::knn::ivf_pq::detail {
+
+/**
+ * Maximum value of k for the fused calculate & select in ivfpq.
+ *
+ * If runtime value of k is larger than this, the main search operation
+ * is split into two kernels (per batch, first calculate distance, then select top-k).
+ */
+static constexpr int kMaxCapacity = 128;
+static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)),
+              "kMaxCapacity must be a power of two, not smaller than the WarpSize.");
+
+using namespace raft::spatial::knn::detail;  // NOLINT
+
+/** 8-bit floating-point storage type.
+ *
+ * This is a custom type for the current IVF-PQ implementation. No arithmetic operations are
+ * defined, only conversion to and from fp32. This type is unrelated to the proposed FP8
+ * specification.
+ */
+template
+struct fp_8bit {
+  static_assert(ExpBits + uint8_t{Signed} <= 8, "The type does not fit in 8 bits.");
+  constexpr static uint32_t ExpMask = (1u << (ExpBits - 1u)) - 1u;  // NOLINT
+  constexpr static uint32_t ValBits = 8u - ExpBits;                 // NOLINT
+
+ public:
+  uint8_t bitstring;
+
+  HDI explicit fp_8bit(uint8_t bs) : bitstring(bs) {}
+  HDI explicit fp_8bit(float fp) : fp_8bit(float2fp_8bit(fp).bitstring) {}
+  HDI auto operator=(float fp) -> fp_8bit&
+  {
+    bitstring = float2fp_8bit(fp).bitstring;
+    return *this;
+  }
+  HDI explicit operator float() const { return fp_8bit2float(*this); }
+
+ private:
+  static constexpr float kMin = 1.0f / float(1u << ExpMask);
+  static constexpr float kMax = float(1u << (ExpMask + 1)) * (2.0f - 1.0f / float(1u << ValBits));
+
+  static HDI auto float2fp_8bit(float v) -> fp_8bit
+  {
+    if constexpr (Signed) {
+      auto u = fp_8bit(std::abs(v)).bitstring;
+      u      = (u & 0xfeu) | uint8_t{v < 0};  // set the sign bit
+      return fp_8bit(u);
+    } else {
+      // sic! all small and negative numbers are truncated to zero.
+      if (v < kMin) { return fp_8bit{static_cast(0)}; }
+      // protect from overflow
+      if (v >= kMax) { return fp_8bit{static_cast(0xffu)}; }
+      // the rest of possible float values should be within the normalized range
+      return fp_8bit{static_cast(
+        (*reinterpret_cast(&v) + (ExpMask << 23u) - 0x3f800000u) >> (15u + ExpBits))};
+    }
+  }
+
+  static HDI auto fp_8bit2float(const fp_8bit& v) -> float
+  {
+    uint32_t u = v.bitstring;
+    if constexpr (Signed) {
+      u &= ~1;  // zero the sign bit
+    }
+    float r;
+    *reinterpret_cast(&r) =
+      ((u << (15u + ExpBits)) + (0x3f800000u | (0x00400000u >> ValBits)) - (ExpMask << 23));
+    if constexpr (Signed) {  // recover the sign bit
+      if (v.bitstring & 1) { r = -r; }
+    }
+    return r;
+  }
+};
+
+/**
+ * Select the clusters to probe and, as a side-effect, translate the queries type `T -> float`
+ *
+ * Assuming the number of clusters is not that big (a few thousands), we do a plain GEMM
+ * followed by select_topk to select the clusters to probe. There's no need to return the similarity
+ * scores here.
+ */
+template
+void select_clusters(const handle_t& handle,
+                     uint32_t* clusters_to_probe,  // [n_queries, n_probes]
+                     float* float_queries,         // [n_queries, dim_ext]
+                     uint32_t n_queries,
+                     uint32_t n_probes,
+                     uint32_t n_lists,
+                     uint32_t dim,
+                     uint32_t dim_ext,
+                     raft::distance::DistanceType metric,
+                     const T* queries,              // [n_queries, dim]
+                     const float* cluster_centers,  // [n_lists, dim_ext]
+                     rmm::mr::device_memory_resource* mr)
+{
+  auto stream = handle.get_stream();
+  rmm::device_uvector qc_distances(n_queries * n_lists, stream, mr);
+  /* NOTE[qc_distances]
+
+   We compute query-center distances to choose the clusters to probe.
+   We accomplish that with just one GEMM operation thanks to some preprocessing:
+
+     L2 distance:
+       cluster_centers[i, dim()] contains the squared norm of the center vector i;
+       we extend the dimension K of the GEMM to compute it together with all the dot products:
+
+         `cq_distances[i, j] = |cluster_centers[j]|^2 - 2 * (queries[i], cluster_centers[j])`
+
+       This is a monotone mapping of the proper L2 distance.
+
+     IP distance:
+       `cq_distances[i, j] = - (queries[i], cluster_centers[j])`
+
+       This is a negative inner-product distance. We minimize it to find the most similar clusters.
+
+   NB: cq_distances is NOT used further in ivfpq_search.
+   */
+  float norm_factor;
+  switch (metric) {
+    case raft::distance::DistanceType::L2Expanded: norm_factor = 1.0 / -2.0; break;
+    case raft::distance::DistanceType::InnerProduct: norm_factor = 0.0; break;
+    default: RAFT_FAIL("Unsupported distance type %d.", int(metric));
+  }
+  linalg::writeOnlyUnaryOp(
+    float_queries,
+    dim_ext * n_queries,
+    [queries, dim, dim_ext, norm_factor] __device__(float* out, uint32_t ix) {
+      uint32_t col = ix % dim_ext;
+      uint32_t row = ix / dim_ext;
+      *out = col < dim ?
utils::mapping{}(queries[col + dim * row]) : norm_factor; + }, + stream); + + float alpha; + float beta; + uint32_t gemm_k = dim; + switch (metric) { + case raft::distance::DistanceType::L2Expanded: { + alpha = -2.0; + beta = 0.0; + gemm_k = dim + 1; + RAFT_EXPECTS(gemm_k <= dim_ext, "unexpected gemm_k or dim_ext"); + } break; + case raft::distance::DistanceType::InnerProduct: { + alpha = -1.0; + beta = 0.0; + } break; + default: RAFT_FAIL("Unsupported distance type %d.", int(metric)); + } + linalg::gemm(handle, + true, + false, + n_lists, + n_queries, + gemm_k, + &alpha, + cluster_centers, + dim_ext, + float_queries, + dim_ext, + &beta, + qc_distances.data(), + n_lists, + stream); + + // Select neighbor clusters for each query. + rmm::device_uvector cluster_dists(n_queries * n_probes, stream, mr); + select_topk(qc_distances.data(), + nullptr, + n_queries, + n_lists, + n_probes, + cluster_dists.data(), + clusters_to_probe, + true, + stream, + mr); +} + +/** + * For each query, we calculate a cumulative sum of the cluster sizes that we probe, and return that + * in chunk_indices. Essentially this is a segmented inclusive scan of the cluster sizes. The total + * number of samples per query (sum of the cluster sizes that we probe) is returned in n_samples. + */ +template +__launch_bounds__(BlockDim) __global__ + void calc_chunk_indices_kernel(uint32_t n_probes, + const IdxT* cluster_offsets, // [n_clusters + 1] + const uint32_t* clusters_to_probe, // [n_queries, n_probes] + uint32_t* chunk_indices, // [n_queries, n_probes] + uint32_t* n_samples // [n_queries] + ) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage shm; + + // locate the query data + clusters_to_probe += n_probes * blockIdx.x; + chunk_indices += n_probes * blockIdx.x; + + // block scan + const uint32_t n_probes_aligned = Pow2::roundUp(n_probes); + uint32_t total = 0; + for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) { + auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u; + auto chunk = probe_ix < n_probes + ? static_cast(cluster_offsets[label + 1] - cluster_offsets[label]) + : 0u; + if (threadIdx.x == 0) { chunk += total; } + block_scan(shm).InclusiveSum(chunk, chunk, total); + __syncthreads(); + if (probe_ix < n_probes) { chunk_indices[probe_ix] = chunk; } + } + // save the total size + if (threadIdx.x == 0) { n_samples[blockIdx.x] = total; } +} + +template +struct calc_chunk_indices { + public: + struct configured { + void* kernel; + dim3 block_dim; + dim3 grid_dim; + uint32_t n_probes; + + void operator()(const IdxT* cluster_offsets, + const uint32_t* clusters_to_probe, + uint32_t* chunk_indices, + uint32_t* n_samples, + rmm::cuda_stream_view stream) + { + void* args[] = // NOLINT + {&n_probes, &cluster_offsets, &clusters_to_probe, &chunk_indices, &n_samples}; + RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream)); + } + }; + + static auto configure(uint32_t n_probes, uint32_t n_queries) -> configured + { + return try_block_dim<1024>(n_probes, n_queries); + } + + private: + template + static auto try_block_dim(uint32_t n_probes, uint32_t n_queries) -> configured + { + if constexpr (BlockDim >= WarpSize * 2) { + if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); } + } + return {reinterpret_cast(calc_chunk_indices_kernel), + dim3(BlockDim, 1, 1), + dim3(n_queries, 1, 1), + n_probes}; + } +}; + +/** + * Look up the dataset index that corresponds to a sample index. 
+ *
+ * Each query vector was compared to all the vectors from n_probes clusters, and sample_ix is
+ * one such vector. This function looks up which cluster sample_ix belongs to, and returns the
+ * original dataset index for that vector.
+ *
+ * @return whether the input index is in a valid range
+ *   (the opposite can happen if there is not enough data to output in the selected clusters).
+ */
+template
+__device__ auto find_db_row(IdxT& x,  // NOLINT
+                            uint32_t n_probes,
+                            const IdxT* cluster_offsets,     // [n_clusters + 1,]
+                            const uint32_t* cluster_labels,  // [n_probes,]
+                            const uint32_t* chunk_indices    // [n_probes,]
+                            ) -> bool
+{
+  uint32_t ix_min = 0;
+  uint32_t ix_max = n_probes;
+  do {
+    uint32_t i = (ix_min + ix_max) / 2;
+    if (IdxT(chunk_indices[i]) < x) {
+      ix_min = i + 1;
+    } else {
+      ix_max = i;
+    }
+  } while (ix_min < ix_max);
+  if (ix_min == n_probes) { return false; }
+  if (ix_min > 0) { x -= chunk_indices[ix_min - 1]; }
+  x += cluster_offsets[cluster_labels[ix_min]];
+  return true;
+}
+
+template
+__launch_bounds__(BlockDim) __global__
+  void postprocess_neighbors_kernel(IdxT* neighbors,                    // [n_queries, topk]
+                                    const IdxT* db_indices,             // [n_rows]
+                                    const IdxT* cluster_offsets,        // [n_clusters + 1]
+                                    const uint32_t* clusters_to_probe,  // [n_queries, n_probes]
+                                    const uint32_t* chunk_indices,      // [n_queries, n_probes]
+                                    uint32_t n_queries,
+                                    uint32_t n_probes,
+                                    uint32_t topk)
+{
+  uint64_t i        = threadIdx.x + BlockDim * uint64_t(blockIdx.x);
+  uint32_t query_ix = i / uint64_t(topk);
+  if (query_ix >= n_queries) { return; }
+  uint32_t k = i % uint64_t(topk);
+  neighbors += query_ix * topk;
+  IdxT data_ix = neighbors[k];
+  // backtrace the index if we don't have local top-k
+  bool valid = true;
+  if (n_probes > 0) {
+    valid = find_db_row(data_ix,
+                        n_probes,
+                        cluster_offsets,
+                        clusters_to_probe + n_probes * query_ix,
+                        chunk_indices + n_probes * query_ix);
+  }
+  neighbors[k] = valid ? db_indices[data_ix] : std::numeric_limits::max();
+}
+
+/**
+ * Transform found neighbor indices into the corresponding database indices
+ * (as stored in index.indices()).
+ *
+ * When the main kernel runs with a fused top-k (`manage_local_topk == true`), this function simply
+ * fetches the index values by the returned row ids. Otherwise, the found neighbors require extra
+ * pre-processing (performed by `find_db_row`).
+ */
+template
+void postprocess_neighbors(IdxT* neighbors,                    // [n_queries, topk]
+                           bool manage_local_topk,
+                           const IdxT* db_indices,             // [n_rows]
+                           const IdxT* cluster_offsets,        // [n_clusters + 1]
+                           const uint32_t* clusters_to_probe,  // [n_queries, n_probes]
+                           const uint32_t* chunk_indices,      // [n_queries, n_probes]
+                           uint32_t n_queries,
+                           uint32_t n_probes,
+                           uint32_t topk,
+                           rmm::cuda_stream_view stream)
+{
+  constexpr int kPNThreads = 256;
+  const int pn_blocks      = raft::div_rounding_up_unsafe(n_queries * topk, kPNThreads);
+  postprocess_neighbors_kernel
+    <<>>(neighbors,
+         db_indices,
+         cluster_offsets,
+         clusters_to_probe,
+         chunk_indices,
+         n_queries,
+         manage_local_topk ? 0u : n_probes,
+         topk);
+}
+
+/**
+ * Post-process the scores depending on the metric type;
+ * translate the element type if necessary.
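
A host-side restatement of `find_db_row` above (illustrative only): `chunk_indices` is the per-query inclusive scan of the probed-cluster sizes, so a binary search finds which probe a flat sample index falls into, and the cluster offset then rebases it onto a dataset row.

```c++
#include <cstdint>
#include <cstdio>

bool find_db_row(uint64_t& x, uint32_t n_probes,
                 const uint64_t* cluster_offsets,  // [n_clusters + 1]
                 const uint32_t* cluster_labels,   // [n_probes]
                 const uint32_t* chunk_indices)    // [n_probes], inclusive scan
{
  uint32_t lo = 0, hi = n_probes;
  while (lo < hi) {  // find the first probe whose scan value reaches x
    uint32_t mid = (lo + hi) / 2;
    if (uint64_t{chunk_indices[mid]} < x) { lo = mid + 1; } else { hi = mid; }
  }
  if (lo == n_probes) { return false; }        // x lies beyond all probed data
  if (lo > 0) { x -= chunk_indices[lo - 1]; }  // position within that probe's chunk
  x += cluster_offsets[cluster_labels[lo]];    // rebase onto the dataset row
  return true;
}

int main()
{
  uint64_t offsets[] = {0, 100, 250, 400};  // three clusters in the dataset
  uint32_t labels[]  = {2, 0};              // this query probes cluster 2, then 0
  uint32_t chunks[]  = {150, 250};          // chunk sizes 150 and 100, scanned
  uint64_t x         = 160;                 // the 11th sample of the second probe
  if (find_db_row(x, 2, offsets, labels, chunks)) {
    printf("dataset row = %llu\n", (unsigned long long)x);  // prints 10
  }
}
```
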
+ */ +template +void postprocess_distances(float* out, // [n_queries, topk] + const ScoreT* in, // [n_queries, topk] + distance::DistanceType metric, + uint32_t n_queries, + uint32_t topk, + float scaling_factor, + rmm::cuda_stream_view stream) +{ + size_t len = size_t(n_queries) * size_t(topk); + switch (metric) { + case distance::DistanceType::L2Unexpanded: + case distance::DistanceType::L2Expanded: { + linalg::unaryOp( + out, + in, + len, + [scaling_factor] __device__(ScoreT x) -> float { + return scaling_factor * scaling_factor * float(x); + }, + stream); + } break; + case distance::DistanceType::L2SqrtUnexpanded: + case distance::DistanceType::L2SqrtExpanded: { + linalg::unaryOp( + out, + in, + len, + [scaling_factor] __device__(ScoreT x) -> float { return scaling_factor * sqrtf(float(x)); }, + stream); + } break; + case distance::DistanceType::InnerProduct: { + linalg::unaryOp( + out, + in, + len, + [scaling_factor] __device__(ScoreT x) -> float { + return -scaling_factor * scaling_factor * float(x); + }, + stream); + } break; + default: RAFT_FAIL("Unexpected metric."); + } +} + +/** + * @brief Compute the similarity score between a vector from `pq_dataset` and a query vector. + * + * @tparam OpT an unsigned integer type that is used for bit operations on multiple PQ codes + * at once; it's selected to maximize throughput while matching criteria: + * 1. `pq_bits * vec_len % 8 * sizeof(OpT) == 0`. + * 2. `pq_dim % vec_len == 0` + * + * @tparam LutT type of the elements in the lookup table. + * + * @param pq_bits The bit length of an encoded vector element after compression by PQ + * @param vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) + * @param pq_dim + * @param[in] pq_code_ptr + * a device pointer to the dataset at the indexed position (`pq_dim * pq_bits` bits-wide) + * @param[in] lut_scores + * a device or shared memory pointer to the lookup table [pq_dim, pq_book_size] + * + * @return the score for the entry `data_ix` in the `pq_dataset`. + */ +template +__device__ auto ivfpq_compute_score( + uint32_t pq_bits, uint32_t vec_len, uint32_t pq_dim, const OpT* pq_head, const LutT* lut_scores) + -> float +{ + float score = 0.0; + constexpr uint32_t kBitsTotal = 8 * sizeof(OpT); + for (; pq_dim > 0; pq_dim -= vec_len) { + OpT pq_code = pq_head[0]; + pq_head++; + auto bits_left = kBitsTotal; + for (uint32_t k = 0; k < vec_len; k++) { + uint8_t code = pq_code; + if (bits_left > pq_bits) { + pq_code >>= pq_bits; + bits_left -= pq_bits; + } else { + if (k < vec_len - 1) { + pq_code = pq_head[0]; + pq_head++; + } + code |= (pq_code << bits_left); + pq_code >>= (pq_bits - bits_left); + bits_left += (kBitsTotal - pq_bits); + } + code &= (1 << pq_bits) - 1; + score += float(lut_scores[code]); + lut_scores += (1 << pq_bits); + } + } + return score; +} + +template +struct dummy_block_sort_t { + using queue_t = topk::warp_sort_immediate; + __device__ dummy_block_sort_t(int k, uint8_t* smem_buf){}; +}; + +template +struct pq_block_sort { + using type = topk::block_sort; +}; + +template +struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t { + using type = dummy_block_sort_t; +}; + +template +using block_sort_t = typename pq_block_sort::type; + +/** + * The main kernel that computes similarity scores across multiple queries and probes. + * When `Capacity > 0`, it also selects top K candidates for each query and probe + * (which need to be merged across probes afterwards). 
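
Stripped of the `OpT` word-level unpacking, `ivfpq_compute_score` above reduces to one lookup-table read per subspace. A scalar sketch of that accumulation (assuming, for simplicity, one byte per code rather than the packed layout):

```c++
#include <cstdint>

float pq_score(uint32_t pq_bits, uint32_t pq_dim,
               const uint8_t* codes,       // pq_dim codes, one byte each here
               const float* lut_scores)    // [pq_dim << pq_bits]
{
  const uint32_t book_size = 1u << pq_bits;
  float score = 0.0f;
  for (uint32_t j = 0; j < pq_dim; j++) {
    // distance contribution of subspace j under its code
    score += lut_scores[j * book_size + codes[j]];
  }
  return score;
}
```
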
+ *
+ * Each block processes a (query, probe) pair: it calculates the distance between the single query
+ * vector and all the dataset vectors in the cluster that we are probing.
+ *
+ * @tparam OpT is a carrier integer type selected to maximize throughput;
+ *   Used solely in `ivfpq_compute_score`;
+ * @tparam IdxT
+ *   The type of data indices
+ * @tparam OutT
+ *   The output type - distances.
+ * @tparam LutT
+ *   The lookup table element type (lut_scores).
+ * @tparam Capacity
+ *   Power-of-two; the maximum possible `k` in top-k. Value zero disables fused top-k search.
+ * @tparam PrecompBaseDiff
+ *   Defines whether we should precompute part of the distance and keep it in shared memory
+ *   before the main part (score calculation) to increase memory usage efficiency in the latter.
+ *   For L2, this is the distance between the query and the cluster center.
+ * @tparam EnableSMemLut
+ *   Defines whether to use the shared memory for the lookup table (`lut_scores`).
+ *   Setting this to `false` allows reducing the shared memory usage (and maximum data dim)
+ *   at the cost of reducing global memory reading throughput.
+ *
+ * @param n_rows the number of records in the dataset
+ * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`).
+ * @param n_probes the number of clusters to search for each query
+ * @param pq_bits the bit length of an encoded vector element after compression by PQ
+ *   (NB: pq_book_size = 1 << pq_bits).
+ * @param pq_dim
+ *   The dimensionality of an encoded vector after compression by PQ.
+ * @param n_queries the number of queries.
+ * @param metric the distance type.
+ * @param codebook_kind Defines the way PQ codebooks have been trained.
+ * @param topk the `k` in the select top-k.
+ * @param cluster_centers
+ *   The device pointer to the cluster centers in the original space (NB: after rotation)
+ *   [n_clusters, dim].
+ * @param pq_centers
+ *   The device pointer to the cluster centers in the PQ space
+ *   [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len,].
+ * @param pq_dataset
+ *   The device pointer to the PQ index (data) [n_rows, pq_dim * pq_bits / 8].
+ * @param cluster_offsets
+ *   The device pointer to the cluster offsets [n_clusters + 1].
+ * @param cluster_labels
+ *   The device pointer to the labels (clusters) for each query and probe [n_queries, n_probes].
+ * @param _chunk_indices
+ *   The device pointer to the data offsets for each query and probe [n_queries, n_probes].
+ * @param queries
+ *   The device pointer to the queries (NB: after rotation) [n_queries, dim].
+ * @param index_list
+ *   An optional device pointer to the enforced order of search [n_queries, n_probes].
+ *   One can pass reordered indices here to try to improve data reading locality.
+ * @param lut_scores
+ *   The device pointer for storing the lookup table globally [gridDim.x, pq_dim << pq_bits].
+ *   Ignored when `EnableSMemLut == true`.
+ * @param _out_scores
+ *   The device pointer to the output scores
+ *   [n_queries, max_samples] or [n_queries, n_probes, topk].
+ * @param _out_indices
+ *   The device pointer to the output indices [n_queries, n_probes, topk].
+ *   Ignored when `Capacity == 0`.
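
The reason one table read per subspace suffices: a PQ-reconstructed point is the coarse center plus one codebook entry per subspace, so under squared L2 the distance splits into `pq_dim` independent terms, which is exactly what the table-building pass of this kernel precomputes. In the notation of the parameters above (a sketch; `(q - c)_j` denotes the `j`-th `pq_len`-wide slice of the query residual and `P_j` the codebook for subspace `j`):

```latex
\lVert q - x \rVert^2 \;\approx\; \sum_{j=1}^{\mathrm{pq\_dim}}
  \bigl\lVert (q - c)_j - P_j[\mathrm{code}_j] \bigr\rVert^2
  \;=\; \sum_{j=1}^{\mathrm{pq\_dim}} \mathtt{lut\_scores}\bigl[j,\ \mathrm{code}_j\bigr]
```
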
+ */ +template +__launch_bounds__(1024) __global__ + void ivfpq_compute_similarity_kernel(uint32_t n_rows, + uint32_t dim, + uint32_t n_probes, + uint32_t pq_bits, + uint32_t pq_dim, + uint32_t n_queries, + distance::DistanceType metric, + codebook_gen codebook_kind, + uint32_t topk, + const float* cluster_centers, + const float* pq_centers, + const uint8_t* pq_dataset, + const IdxT* cluster_offsets, + const uint32_t* cluster_labels, + const uint32_t* _chunk_indices, + const float* queries, + const uint32_t* index_list, + LutT* lut_scores, + OutT* _out_scores, + IdxT* _out_indices) +{ + /* Shared memory: + + * lut_scores: lookup table (LUT) of size = `pq_dim << pq_bits` (when EnableSMemLut) + * base_diff: size = dim (which is equal to `pq_dim * pq_len`) + * topk::block_sort: some amount of shared memory, but overlaps with the rest: + block_sort only needs shared memory for `.done()` operation, which can come very last. + */ + extern __shared__ __align__(256) uint8_t smem_buf[]; // NOLINT + constexpr bool kManageLocalTopK = Capacity > 0; + constexpr uint32_t kOpBits = 8 * sizeof(OpT); + + const uint32_t pq_len = dim / pq_dim; + const uint32_t vec_len = kOpBits / gcd(kOpBits, pq_bits); + + if constexpr (EnableSMemLut) { + lut_scores = reinterpret_cast(smem_buf); + } else { + lut_scores += (pq_dim << pq_bits) * blockIdx.x; + } + + float* base_diff = nullptr; + if constexpr (PrecompBaseDiff) { + if constexpr (EnableSMemLut) { + base_diff = reinterpret_cast(lut_scores + (pq_dim << pq_bits)); + } else { + base_diff = reinterpret_cast(smem_buf); + } + } + + for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) { + uint32_t query_ix; + uint32_t probe_ix; + if (index_list == nullptr) { + query_ix = ib % n_queries; + probe_ix = ib / n_queries; + } else { + query_ix = index_list[ib] / n_probes; + probe_ix = index_list[ib] % n_probes; + } + if (query_ix >= n_queries || probe_ix >= n_probes) continue; + + const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix); + const float* query = queries + (dim * query_ix); + OutT* out_scores; + IdxT* out_indices = nullptr; + if constexpr (kManageLocalTopK) { + // Store topk calculated distances to out_scores (and its indices to out_indices) + out_scores = _out_scores + topk * (probe_ix + (n_probes * query_ix)); + out_indices = _out_indices + topk * (probe_ix + (n_probes * query_ix)); + } else { + // Store all calculated distances to out_scores + auto max_samples = Pow2<128>::roundUp(cluster_offsets[n_probes]); + out_scores = _out_scores + max_samples * query_ix; + } + uint32_t label = cluster_labels[n_probes * query_ix + probe_ix]; + const float* cluster_center = cluster_centers + (dim * label); + const float* pq_center; + if (codebook_kind == codebook_gen::PER_SUBSPACE) { + pq_center = pq_centers; + } else { + pq_center = pq_centers + (pq_len << pq_bits) * label; + } + + if constexpr (PrecompBaseDiff) { + // Reduce computational complexity by pre-computing the difference + // between the cluster centroid and the query. + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + base_diff[i] = query[i] - cluster_center[i]; + } + __syncthreads(); + } + + // Create a lookup table + // For each subspace, the lookup table stores the distance between the actual query vector + // (projected into the subspace) and all possible pq vectors in that subspace. + for (uint32_t i = threadIdx.x; i < (pq_dim << pq_bits); i += blockDim.x) { + uint32_t i_pq = i >> pq_bits; + uint32_t i_code = codebook_kind == codebook_gen::PER_CLUSTER ? 
i & ((1 << pq_bits) - 1) : i;
+      float score = 0.0;
+      switch (metric) {
+        case distance::DistanceType::L2Expanded: {
+          for (uint32_t j = 0; j < pq_len; j++) {
+            uint32_t k = j + (pq_len * i_pq);
+            float diff;
+            if constexpr (PrecompBaseDiff) {
+              diff = base_diff[k];
+            } else {
+              diff = query[k] - cluster_center[k];
+            }
+            diff -= pq_center[j + pq_len * i_code];
+            score += diff * diff;
+          }
+        } break;
+        case distance::DistanceType::InnerProduct: {
+          for (uint32_t j = 0; j < pq_len; j++) {
+            uint32_t k = j + (pq_len * i_pq);
+            score += query[k] * (cluster_center[k] + pq_center[j + pq_len * i_code]);
+          }
+        } break;
+      }
+      lut_scores[i] = LutT(score);
+    }
+
+    uint32_t sample_offset = 0;
+    if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; }
+    uint32_t n_samples   = chunk_indices[probe_ix] - sample_offset;
+    uint32_t n_samples32 = Pow2<32>::roundUp(n_samples);
+    IdxT cluster_offset  = cluster_offsets[label];
+
+    using local_topk_t = block_sort_t;
+    local_topk_t block_topk(topk, smem_buf);
+
+    // Ensure lut_scores is written by all threads before using it in ivfpq_compute_score
+    __threadfence_block();
+    __syncthreads();
+
+    // Compute a distance for each sample
+    const uint32_t pq_line_width = pq_dim * pq_bits / 8;
+    for (uint32_t i = threadIdx.x; i < n_samples32; i += blockDim.x) {
+      OutT score = local_topk_t::queue_t::kDummy;
+      if (i < n_samples) {
+        auto pq_ptr =
+          reinterpret_cast(pq_dataset + uint64_t(pq_line_width) * (cluster_offset + i));
+        float fscore = ivfpq_compute_score(pq_bits, vec_len, pq_dim, pq_ptr, lut_scores);
+        switch (metric) {
+          // For similarity metrics,
+          // we negate the scores as we hardcoded select-topk to always take the minimum
+          case distance::DistanceType::InnerProduct: fscore = -fscore; break;
+          default: break;
+        }
+        if (fscore < float(score)) { score = OutT{fscore}; }
+      }
+      if constexpr (kManageLocalTopK) {
+        block_topk.add(score, cluster_offset + i);
+      } else {
+        if (i < n_samples) { out_scores[i + sample_offset] = score; }
+      }
+    }
+    __syncthreads();
+    if constexpr (kManageLocalTopK) {
+      // sync threads before and after the topk merging operation, because we reuse smem_buf
+      block_topk.done();
+      block_topk.store(out_scores, out_indices);
+      __syncthreads();
+    } else {
+      // fill in the rest of the out_scores with dummy values
+      uint32_t max_samples = uint32_t(Pow2<128>::roundUp(cluster_offsets[n_probes]));
+      if (probe_ix + 1 == n_probes) {
+        for (uint32_t i = threadIdx.x + sample_offset + n_samples; i < max_samples;
+             i += blockDim.x) {
+          out_scores[i] = local_topk_t::queue_t::kDummy;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * This structure selects configurable template parameters (instance) based on
+ * the search/index parameters at runtime.
+ *
+ * This is done by means of recursively iterating through a small set of possible
+ * values for every parameter.
+ */
+template
+struct ivfpq_compute_similarity {
+  using kernel_t = void (*)(uint32_t,
+                            uint32_t,
+                            uint32_t,
+                            uint32_t,
+                            uint32_t,
+                            uint32_t,
+                            distance::DistanceType,
+                            codebook_gen,
+                            uint32_t,
+                            const float*,
+                            const float*,
+                            const uint8_t*,
+                            const IdxT*,
+                            const uint32_t*,
+                            const uint32_t*,
+                            const float*,
+                            const uint32_t*,
+                            LutT*,
+                            OutT*,
+                            IdxT*);
+
+  template
+  struct configured {
+   public:
+    /**
+     * Select a proper kernel instance based on the runtime parameters.
+ * + * @param pq_bits + * @param pq_dim + * @param k_max + */ + static auto kernel(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t + { + return kernel_base(pq_bits, pq_dim, k_max); + } + + private: + template + static auto kernel_try_capacity(uint32_t k_max) -> kernel_t + { + if constexpr (Capacity > 0) { + if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } + } + if constexpr (Capacity > 32) { + if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } + } + return ivfpq_compute_similarity_kernel; + } + + static auto kernel_base(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t + { + switch (gcd(pq_bits * pq_dim, 64)) { + case 64: return kernel_try_capacity(k_max); + case 32: return kernel_try_capacity(k_max); + case 16: return kernel_try_capacity(k_max); + case 8: return kernel_try_capacity(k_max); + default: + RAFT_FAIL("`pq_bits * pq_dim` must be a multiple of 8 (pq_bits = %u, pq_dim = %u).", + pq_bits, + pq_dim); + } + } + }; + + struct selected { + void* kernel; + dim3 grid_dim; + dim3 block_dim; + size_t smem_size; + size_t device_lut_size; + + template + void operator()(rmm::cuda_stream_view stream, Args... args) + { + void* xs[] = {&args...}; // NOLINT + RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, xs, smem_size, stream)); + } + }; + + /** + * Use heuristics to choose an optimal instance of the search kernel. + * It selects among a few kernel variants (with/out using shared mem for + * lookup tables / precomputed distances) and tries to choose the block size + * to maximize kernel occupancy. + * + * @param manage_local_topk + * whether use the fused calculate+select or just calculate the distances for each + * query and probed cluster. + * + */ + static inline auto select(bool manage_local_topk, + uint32_t pq_bits, + uint32_t pq_dim, + uint32_t rot_dim, + uint32_t preferred_thread_block_size, + uint32_t n_queries, + uint32_t n_probes, + uint32_t topk) -> selected + { + using conf_fast = configured; + using conf_no_basediff = configured; + using conf_no_smem_lut = configured; + + kernel_t kernel_fast = conf_fast::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); + kernel_t kernel_no_basediff = + conf_no_basediff::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); + kernel_t kernel_no_smem_lut = + conf_no_smem_lut::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); + + const size_t smem_threshold = 48 * 1024; + size_t smem_size = sizeof(LutT) * (pq_dim << pq_bits); + size_t smem_size_base_diff = sizeof(float) * rot_dim; + + uint32_t n_blocks = n_queries * n_probes; + uint32_t n_threads = 1024; + // preferred_thread_block_size == 0 means using auto thread block size calculation mode + if (preferred_thread_block_size == 0) { + const uint32_t thread_min = 256; + int cur_dev; + cudaDeviceProp dev_props; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + RAFT_CUDA_TRY(cudaGetDeviceProperties(&dev_props, cur_dev)); + while (n_threads > thread_min) { + if (n_blocks < uint32_t(getMultiProcessorCount() * (1024 / (n_threads / 2)))) { break; } + if (dev_props.sharedMemPerMultiprocessor * 2 / 3 < smem_size * (1024 / (n_threads / 2))) { + break; + } + n_threads /= 2; + } + } else { + n_threads = preferred_thread_block_size; + } + size_t smem_size_local_topk = + manage_local_topk + ? 
topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) + : 0; + smem_size = max(smem_size, smem_size_local_topk); + + kernel_t kernel = kernel_no_basediff; + + bool kernel_no_basediff_available = true; + bool use_smem_lut = true; + if (smem_size > smem_threshold) { + cudaError_t cuda_status = cudaFuncSetAttribute( + kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + if (cuda_status != cudaSuccess) { + RAFT_EXPECTS( + cuda_status == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); + kernel_no_basediff_available = false; + + // Use "kernel_no_smem_lut" which just uses small amount of shared memory. + RAFT_LOG_DEBUG( + "Non-shared-mem look-up table kernel is selected, because it wouldn't fit shmem " + "required: " + "%zu bytes)", + smem_size); + kernel = kernel_no_smem_lut; + use_smem_lut = false; + n_threads = 1024; + smem_size_local_topk = + manage_local_topk + ? topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) + : 0; + smem_size = max(smem_size_base_diff, smem_size_local_topk); + n_blocks = getMultiProcessorCount(); + } + } + if (kernel_no_basediff_available) { + bool kernel_fast_available = true; + if (smem_size + smem_size_base_diff > smem_threshold) { + cudaError_t cuda_status = cudaFuncSetAttribute(kernel_fast, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size + smem_size_base_diff); + if (cuda_status != cudaSuccess) { + RAFT_EXPECTS( + cuda_status == cudaGetLastError(), + "Tried to reset the expected cuda error code, but it didn't match the expectation"); + kernel_fast_available = false; + RAFT_LOG_DEBUG( + "No-precomputed-basediff kernel is selected, because the basediff wouldn't fit (shmem " + "required: %zu bytes)", + smem_size + smem_size_base_diff); + } + } + if (kernel_fast_available) { + int kernel_no_basediff_n_blocks = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &kernel_no_basediff_n_blocks, kernel_no_basediff, n_threads, smem_size)); + + int kernel_fast_n_blocks = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &kernel_fast_n_blocks, kernel_fast, n_threads, smem_size + smem_size_base_diff)); + + // Use "kernel_fast" only if GPU occupancy does not drop + if (kernel_no_basediff_n_blocks == kernel_fast_n_blocks) { + kernel = kernel_fast; + smem_size += smem_size_base_diff; + } + } + } + + uint32_t device_lut_size = use_smem_lut ? 0u : n_blocks * (pq_dim << pq_bits); + return {reinterpret_cast(kernel), + dim3(n_blocks, 1, 1), + dim3(n_threads, 1, 1), + smem_size, + device_lut_size}; + } +}; + +/** + * The "main part" of the search, which assumes that outer-level `search` has already: + * + * 1. computed the closest clusters to probe (`clusters_to_probe`); + * 2. transformed input queries into the rotated space (rot_dim); + * 3. split the query batch into smaller chunks, so that the device workspace + * is guaranteed to fit into GPU memory. 
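
The `select()` heuristic above only upgrades to the `PrecompBaseDiff` kernel when the extra `rot_dim` floats of shared memory cost no occupancy. A reduced sketch of that test (an assumption-laden simplification; the real code also handles the `cudaFuncSetAttribute` opt-in for more than 48 KiB of dynamic shared memory):

```c++
#include <cuda_runtime.h>

// How many blocks of a kernel fit on one SM for a given launch configuration.
template <typename Kernel>
int blocks_per_sm(Kernel kernel, int n_threads, size_t smem_size)
{
  int n = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n, kernel, n_threads, smem_size);
  return n;
}

// Prefer the 'fast' kernel only if its extra shared memory is occupancy-neutral:
//   if (blocks_per_sm(kernel_fast, t, smem + base_diff) ==
//       blocks_per_sm(kernel_no_basediff, t, smem)) { kernel = kernel_fast; }
```
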
+
+/**
+ * The "main part" of the search, which assumes that outer-level `search` has already:
+ *
+ *   1. computed the closest clusters to probe (`clusters_to_probe`);
+ *   2. transformed input queries into the rotated space (rot_dim);
+ *   3. split the query batch into smaller chunks, so that the device workspace
+ *      is guaranteed to fit into GPU memory.
+ */
+template <typename ScoreT, typename LutT, typename IdxT>
+void ivfpq_search_worker(const handle_t& handle,
+                         const index<IdxT>& index,
+                         uint32_t max_samples,
+                         uint32_t n_probes,
+                         uint32_t topK,
+                         uint32_t preferred_thread_block_size,
+                         uint32_t n_queries,
+                         const uint32_t* clusters_to_probe,  // [n_queries, n_probes]
+                         const float* query,                 // [n_queries, rot_dim]
+                         IdxT* neighbors,                    // [n_queries, topK]
+                         float* distances,                   // [n_queries, topK]
+                         float scaling_factor,
+                         rmm::mr::device_memory_resource* mr)
+{
+  auto stream = handle.get_stream();
+
+  auto pq_centers      = index.pq_centers().data_handle();
+  auto pq_dataset      = index.pq_dataset().data_handle();
+  auto data_indices    = index.indices().data_handle();
+  auto cluster_centers = index.centers_rot().data_handle();
+  auto cluster_offsets = index.list_offsets().data_handle();
+
+  bool manage_local_topk = topK <= kMaxCapacity            // depth is not too large
+                           && n_probes >= 16               // not too few clusters looked up
+                           && n_queries * n_probes >= 256  // overall amount of work is not too small
+    ;
+  auto topk_len = manage_local_topk ? n_probes * topK : max_samples;
+  if (manage_local_topk) {
+    RAFT_LOG_DEBUG("Fused version of the search kernel is selected (manage_local_topk == true)");
+  } else {
+    RAFT_LOG_DEBUG(
+      "Non-fused version of the search kernel is selected (manage_local_topk == false)");
+  }
+
+  rmm::device_uvector<uint32_t> index_list_sorted_buf(0, stream, mr);
+  uint32_t* index_list_sorted = nullptr;
+  rmm::device_uvector<uint32_t> num_samples(n_queries, stream, mr);
+  rmm::device_uvector<uint32_t> chunk_index(n_queries * n_probes, stream, mr);
+  // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk]
+  rmm::device_uvector<ScoreT> distances_buf(n_queries * topk_len, stream, mr);
+  rmm::device_uvector<IdxT> neighbors_buf(0, stream, mr);
+  IdxT* neighbors_ptr = nullptr;
+  if (manage_local_topk) {
+    neighbors_buf.resize(n_queries * topk_len, stream);
+    neighbors_ptr = neighbors_buf.data();
+  }
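+  // Sizing note (illustrative numbers, not from the upstream change): on the fused
+  // path, with e.g. n_probes = 32 and topK = 10, only n_probes * topK = 320
+  // candidates are buffered per query; the non-fused path instead keeps a score for
+  // all max_samples points of the probed clusters, which is typically far larger.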
+
+  calc_chunk_indices<IdxT>::configure(n_probes, n_queries)(
+    cluster_offsets, clusters_to_probe, chunk_index.data(), num_samples.data(), stream);
+
+  if (n_queries * n_probes > 256) {
+    // Sort the indices by cluster number (label).
+    // The goal is to increase the L2 cache hit rate: by processing all probes of the
+    // same cluster together, the vectors of that cluster are read as few times as
+    // possible.
+    index_list_sorted_buf.resize(n_queries * n_probes, stream);
+    rmm::device_uvector<uint32_t> index_list_buf(n_queries * n_probes, stream, mr);
+    rmm::device_uvector<uint32_t> cluster_labels_out(n_queries * n_probes, stream, mr);
+    auto index_list   = index_list_buf.data();
+    index_list_sorted = index_list_sorted_buf.data();
+    thrust::sequence(handle.get_thrust_policy(),
+                     thrust::device_pointer_cast(index_list),
+                     thrust::device_pointer_cast(index_list + n_queries * n_probes));
+
+    int begin_bit             = 0;
+    int end_bit               = sizeof(uint32_t) * 8;
+    size_t cub_workspace_size = 0;
+    // The first call (with a nullptr workspace) only queries the workspace size.
+    cub::DeviceRadixSort::SortPairs(nullptr,
+                                    cub_workspace_size,
+                                    clusters_to_probe,
+                                    cluster_labels_out.data(),
+                                    index_list,
+                                    index_list_sorted,
+                                    n_queries * n_probes,
+                                    begin_bit,
+                                    end_bit,
+                                    stream);
+    rmm::device_buffer cub_workspace(cub_workspace_size, stream, mr);
+    cub::DeviceRadixSort::SortPairs(cub_workspace.data(),
+                                    cub_workspace_size,
+                                    clusters_to_probe,
+                                    cluster_labels_out.data(),
+                                    index_list,
+                                    index_list_sorted,
+                                    n_queries * n_probes,
+                                    begin_bit,
+                                    end_bit,
+                                    stream);
+  }
+
+  // select and run the main search kernel
+  auto search_instance =
+    ivfpq_compute_similarity<ScoreT, LutT, IdxT>::select(manage_local_topk,
+                                                         index.pq_bits(),
+                                                         index.pq_dim(),
+                                                         index.rot_dim(),
+                                                         preferred_thread_block_size,
+                                                         n_queries,
+                                                         n_probes,
+                                                         topK);
+
+  rmm::device_uvector<LutT> device_lut(search_instance.device_lut_size, stream, mr);
+  search_instance(stream,
+                  index.size(),
+                  index.rot_dim(),
+                  n_probes,
+                  index.pq_bits(),
+                  index.pq_dim(),
+                  n_queries,
+                  index.metric(),
+                  index.codebook_kind(),
+                  topK,
+                  cluster_centers,
+                  pq_centers,
+                  pq_dataset,
+                  cluster_offsets,
+                  clusters_to_probe,
+                  chunk_index.data(),
+                  query,
+                  index_list_sorted,
+                  device_lut.data(),
+                  distances_buf.data(),
+                  neighbors_ptr);
+
+  // Select topk vectors for each query
+  rmm::device_uvector<ScoreT> topk_dists(n_queries * topK, stream, mr);
+  select_topk(distances_buf.data(),
+              neighbors_ptr,
+              n_queries,
+              topk_len,
+              topK,
+              topk_dists.data(),
+              neighbors,
+              true,
+              stream,
+              mr);
+
+  // Postprocessing
+  postprocess_distances(
+    distances, topk_dists.data(), index.metric(), n_queries, topK, scaling_factor, stream);
+  postprocess_neighbors(neighbors,
+                        manage_local_topk,
+                        data_indices,
+                        cluster_offsets,
+                        clusters_to_probe,
+                        chunk_index.data(),
+                        n_queries,
+                        n_probes,
+                        topK,
+                        stream);
+}
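+
+// Dispatch sketch (illustrative; the exact template-argument order is an assumption):
+// the helper below maps the run-time dtype parameters onto a concrete instantiation
+// of ivfpq_search_worker, e.g.
+//
+//   internal_distance_dtype = CUDA_R_16F, lut_dtype = CUDA_R_32F
+//     -> ivfpq_search_worker<half, float, IdxT>
+//   internal_distance_dtype = CUDA_R_32F, lut_dtype = CUDA_R_8U (unsigned metric)
+//     -> ivfpq_search_worker<float, fp_8bit<5, false>, IdxT>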
+
+/**
+ * This structure helps to select a proper instance of the worker search function,
+ * which contains a few template parameters.
+ */
+template <typename IdxT>
+struct ivfpq_search {
+ public:
+  using fun_t = void (*)(const handle_t&,
+                         const ivf_pq::index<IdxT>&,
+                         uint32_t,
+                         uint32_t,
+                         uint32_t,
+                         uint32_t,
+                         uint32_t,
+                         const uint32_t*,
+                         const float*,
+                         IdxT*,
+                         float*,
+                         float,
+                         rmm::mr::device_memory_resource*);
+
+  /**
+   * Select an instance of the ivf-pq search function based on search tuning parameters,
+   * such as the look-up data type or the internal score type.
+   */
+  static auto fun(const search_params& params, distance::DistanceType metric) -> fun_t
+  {
+    return fun_try_score_t(params, metric);
+  }
+
+ private:
+  template <typename ScoreT>
+  static auto fun_try_lut_t(const search_params& params, distance::DistanceType metric) -> fun_t
+  {
+    bool signed_metric = false;
+    switch (metric) {
+      case raft::distance::DistanceType::InnerProduct: signed_metric = true; break;
+      default: break;
+    }
+
+    switch (params.lut_dtype) {
+      case CUDA_R_32F: return ivfpq_search_worker<ScoreT, float, IdxT>;
+      case CUDA_R_16F: return ivfpq_search_worker<ScoreT, half, IdxT>;
+      case CUDA_R_8U:
+      case CUDA_R_8I:
+        if (signed_metric) {
+          return ivfpq_search_worker<ScoreT, fp_8bit<5, true>, IdxT>;
+        } else {
+          return ivfpq_search_worker<ScoreT, fp_8bit<5, false>, IdxT>;
+        }
+      default: RAFT_FAIL("Unexpected lut_dtype (%d)", int(params.lut_dtype));
+    }
+  }
+
+  static auto fun_try_score_t(const search_params& params, distance::DistanceType metric) -> fun_t
+  {
+    switch (params.internal_distance_dtype) {
+      case CUDA_R_32F: return fun_try_lut_t<float>(params, metric);
+      case CUDA_R_16F: return fun_try_lut_t<half>(params, metric);
+      default:
+        RAFT_FAIL("Unexpected internal_distance_dtype (%d)", int(params.internal_distance_dtype));
+    }
+  }
+};
+
+/**
+ * A heuristic for bounding the number of queries per batch, to improve GPU utilization
+ * (based on the number of SMs and the work size).
+ *
+ * @param n_queries number of queries hoped to be processed at once
+ *   (maximum value for the returned batch size).
+ *
+ * @return maximum recommended batch size.
+ */
+inline auto get_max_batch_size(uint32_t n_queries) -> uint32_t
+{
+  uint32_t max_batch_size         = n_queries;
+  uint32_t n_ctas_total           = getMultiProcessorCount() * 2;
+  uint32_t n_ctas_total_per_batch = n_ctas_total / max_batch_size;
+  float utilization               = float(n_ctas_total_per_batch * max_batch_size) / n_ctas_total;
+  if (n_ctas_total_per_batch > 1 || (n_ctas_total_per_batch == 1 && utilization < 0.6)) {
+    uint32_t n_ctas_total_per_batch_1 = n_ctas_total_per_batch + 1;
+    uint32_t max_batch_size_1         = n_ctas_total / n_ctas_total_per_batch_1;
+    float utilization_1 = float(n_ctas_total_per_batch_1 * max_batch_size_1) / n_ctas_total;
+    if (utilization < utilization_1) { max_batch_size = max_batch_size_1; }
+  }
+  return max_batch_size;
+}
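+
+// Worked example (illustrative, assuming a GPU with 108 SMs, i.e. n_ctas_total = 216):
+// for n_queries = 100, n_ctas_total_per_batch = 216 / 100 = 2 and the utilization is
+// (2 * 100) / 216 ~= 0.93; trying one more CTA per batch gives
+// max_batch_size_1 = 216 / 3 = 72 with utilization 216 / 216 = 1.0, so the heuristic
+// shrinks the batch to 72 queries.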
+
+/** See raft::spatial::knn::ivf_pq::search docs */
+template <typename T, typename IdxT>
+inline void search(const handle_t& handle,
+                   const search_params& params,
+                   const index<IdxT>& index,
+                   const T* queries,
+                   uint32_t n_queries,
+                   uint32_t k,
+                   IdxT* neighbors,
+                   float* distances,
+                   rmm::mr::device_memory_resource* mr = nullptr)
+{
+  static_assert(
+    std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+    "Unsupported element type.");
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_pq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
+
+  RAFT_EXPECTS(
+    params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F,
+    "internal_distance_dtype must be either CUDA_R_16F or CUDA_R_32F");
+  RAFT_EXPECTS(params.lut_dtype == CUDA_R_16F || params.lut_dtype == CUDA_R_32F ||
+                 params.lut_dtype == CUDA_R_8U,
+               "lut_dtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U");
+  RAFT_EXPECTS(
+    params.preferred_thread_block_size == 256 || params.preferred_thread_block_size == 512 ||
+      params.preferred_thread_block_size == 1024 || params.preferred_thread_block_size == 0,
+    "preferred_thread_block_size must be 0, 256, 512 or 1024, but %u is given.",
+    params.preferred_thread_block_size);
+  RAFT_EXPECTS(k > 0, "parameter `k` in top-k must be positive.");
+  RAFT_EXPECTS(
+    k <= index.size(),
+    "parameter `k` (%u) in top-k must not be larger than the total size of the index (%zu)",
+    k,
+    static_cast<uint64_t>(index.size()));
+  RAFT_EXPECTS(params.n_probes > 0,
+               "n_probes (number of clusters to probe in the search) must be positive.");
+
+  switch (utils::check_pointer_residency(queries, neighbors, distances)) {
+    case utils::pointer_residency::device_only:
+    case utils::pointer_residency::host_and_device: break;
+    default: RAFT_FAIL("all pointers must be accessible from the device.");
+  }
+
+  auto stream = handle.get_stream();
+
+  auto dim      = index.dim();
+  auto dim_ext  = index.dim_ext();
+  auto n_probes = std::min(params.n_probes, index.n_lists());
+
+  IdxT max_samples = 0;
+  {
+    IdxT offset_worst_case = 0;
+    auto cluster_offsets   = index.list_offsets().data_handle();
+    copy(&max_samples, cluster_offsets + n_probes, 1, stream);
+    if (n_probes < index.n_nonempty_lists()) {
+      copy(&offset_worst_case, cluster_offsets + index.n_nonempty_lists() - n_probes, 1, stream);
+    }
+    handle.sync_stream();
+    max_samples      = Pow2<128>::roundUp(max_samples);
+    IdxT min_samples = index.size() - offset_worst_case;
+    if (IdxT{k} > min_samples) {
+      RAFT_LOG_WARN(
+        "n_probes is too small to get top-k results reliably (n_probes: %u, k: %u, n_samples "
+        "(worst_case): %zu).",
+        n_probes,
+        k,
+        static_cast<uint64_t>(min_samples));
+    }
+    RAFT_EXPECTS(max_samples <= IdxT(std::numeric_limits<uint32_t>::max()),
+                 "The maximum sample size is too big.");
+  }
+
+  auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16);
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("ivf_pq::search: using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
+  }
+
+  // Maximum number of query vectors to search at the same time.
+  const auto max_queries = std::min(std::max(n_queries, 1u), 4096u);
+  auto max_batch_size    = get_max_batch_size(max_queries);
+
+  rmm::device_uvector<float> float_queries(max_queries * dim_ext, stream, mr);
+  rmm::device_uvector<float> rot_queries(max_queries * index.rot_dim(), stream, mr);
+  rmm::device_uvector<uint32_t> clusters_to_probe(max_queries * params.n_probes, stream, mr);
+
+  auto search_instance = ivfpq_search<IdxT>::fun(params, index.metric());
+
+  for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) {
+    uint32_t queries_batch = min(max_queries, n_queries - offset_q);
+
+    select_clusters(handle,
+                    clusters_to_probe.data(),
+                    float_queries.data(),
+                    queries_batch,
+                    params.n_probes,
+                    index.n_lists(),
+                    dim,
+                    dim_ext,
+                    index.metric(),
+                    queries + static_cast<size_t>(dim) * offset_q,
+                    index.centers().data_handle(),
+                    mr);
+
+    // Rotate queries
+    float alpha = 1.0;
+    float beta  = 0.0;
+    linalg::gemm(handle,
+                 true,
+                 false,
+                 index.rot_dim(),
+                 queries_batch,
+                 dim,
+                 &alpha,
+                 index.rotation_matrix().data_handle(),
+                 dim,
+                 float_queries.data(),
+                 dim_ext,
+                 &beta,
+                 rot_queries.data(),
+                 index.rot_dim(),
+                 stream);
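+
+    // Batching note (illustrative numbers, not from the upstream change): queries are
+    // processed at two levels, e.g. n_queries = 10000 is first cut into outer chunks
+    // of at most max_queries = 4096 (bounding the workspace buffers above), and each
+    // chunk is then split into sub-batches of max_batch_size in the loop below, to
+    // keep the GPU well utilized (see get_max_batch_size).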
+    for (uint32_t offset_b = 0; offset_b < queries_batch; offset_b += max_batch_size) {
+      uint32_t batch_size = min(max_batch_size, queries_batch - offset_b);
+      /* The distance calculation is done in the rotated/transformed space;
+         as long as `index.rotation_matrix()` is orthogonal, the distances and thus the
+         results are preserved.
+       */
+      search_instance(handle,
+                      index,
+                      max_samples,
+                      params.n_probes,
+                      k,
+                      params.preferred_thread_block_size,
+                      batch_size,
+                      clusters_to_probe.data() + uint64_t(params.n_probes) * offset_b,
+                      rot_queries.data() + uint64_t(index.rot_dim()) * offset_b,
+                      neighbors + uint64_t(k) * (offset_q + offset_b),
+                      distances + uint64_t(k) * (offset_q + offset_b),
+                      utils::config<T>::kDivisor / utils::config<float>::kDivisor,
+                      mr);
+    }
+  }
+}
+
+}  // namespace raft::spatial::knn::ivf_pq::detail
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 7cefeffea2..0c33c3f38f 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include
-#include
+#include
+#include
 
 #include
 #include
@@ -30,8 +30,8 @@
 #include
 #include
 
-#include
-#include
+#include
+#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/spatial/knn/detail/processing.cuh b/cpp/include/raft/spatial/knn/detail/processing.cuh
index 79c437b020..a80c1c1935 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.cuh
+++ b/cpp/include/raft/spatial/knn/detail/processing.cuh
@@ -17,7 +17,7 @@
 
 #include "processing.hpp"
 
-#include
+#include
 #include
 #include
 #include
@@ -93,7 +93,7 @@ class CosineMetricProcessor : public MetricProcessor {
       data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
   }
 
-  virtual void set_num_queries(int k) { k_ = k; }
+  void set_num_queries(int k) override { k_ = k; }
 
   ~CosineMetricProcessor() = default;
 };
diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
index 010bd5aaac..239379aad5 100644
--- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -16,9 +16,9 @@
 
 #pragma once
 
-#include
-#include
 #include
+#include
+#include
 
 #include
 #include
diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/spatial/knn/detail/topk.cuh
new file mode 100644
index 0000000000..5adf6df472
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/topk.cuh
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "topk/radix_topk.cuh"
+#include "topk/warpsort_topk.cuh"
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace raft::spatial::knn::detail {
+
+/**
+ * Select k smallest or largest key/values from each row in the input data.
+ *
+ * If you think of the input data `in` as a row-major matrix with len columns and
+ * batch_size rows, then this function selects k smallest/largest values in each row and fills
+ * in the row-major matrix `out` of size (batch_size, k).
+ *
+ * @tparam T
+ *   the type of the keys (what is being compared).
+ * @tparam IdxT
+ *   the index type (what is being selected together with the keys).
+ *
+ * @param[in] in
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   these are compared and selected.
+ * @param[in] in_idx
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   typically, these are indices of the corresponding elements of `in`.
+ * @param batch_size
+ *   number of input rows, i.e. the batch size.
+ * @param len
+ *   length of a single input array (row); also sometimes referred to as n_cols.
+ *   Invariant: len >= k.
+ * @param k
+ *   the number of outputs to select in each input row.
+ * @param[out] out
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the k smallest/largest values from each row of `in`.
+ * @param[out] out_idx
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the payload selected together with `out`.
+ * @param select_min
+ *   whether to select k smallest (true) or largest (false) keys.
+ * @param stream
+ * @param mr an optional memory resource to use across the calls (you can provide a large enough
+ *   memory pool here to avoid memory allocations within the call).
+ */
+template <typename T, typename IdxT>
+void select_topk(const T* in,
+                 const IdxT* in_idx,
+                 size_t batch_size,
+                 size_t len,
+                 int k,
+                 T* out,
+                 IdxT* out_idx,
+                 bool select_min,
+                 rmm::cuda_stream_view stream,
+                 rmm::mr::device_memory_resource* mr = nullptr)
+{
+  if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+    topk::warp_sort_topk<T, IdxT>(
+      in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr);
+  } else {
+    topk::radix_topk<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
+      in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr);
+  }
+}
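+
+// Usage sketch (illustrative only; buffer setup and sizes are the caller's
+// responsibility, and passing nullptr for in_idx mirrors how ivfpq_search_worker
+// calls this on the non-fused path):
+//
+//   rmm::device_uvector<float> out(batch_size * k, stream);
+//   rmm::device_uvector<uint32_t> out_idx(batch_size * k, stream);
+//   select_topk<float, uint32_t>(scores, nullptr, batch_size, len, k,
+//                                out.data(), out_idx.data(),
+//                                /*select_min=*/true, stream);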
+
+}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh
index 44ffe6bc50..40ac7b0b92 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include
+#include
 
 namespace raft::spatial::knn::detail::topk {
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index 53d88ff366..9c0f20b706 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -18,8 +18,9 @@
 
 #include
 #include
 
-#include
-#include
+#include
+#include
+#include
 
 #include
 #include
diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
index 017678afbb..84cc072620 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
@@ -19,8 +19,8 @@
 
 #include "bitonic_sort.cuh"
 
 #include
-#include
-#include
+#include
+#include
 
 #include
 #include
@@ -135,6 +135,7 @@ constexpr auto calc_capacity(int k) -> int
 template
 class warp_sort {
   static_assert(isPo2(Capacity));
+  static_assert(std::is_default_constructible_v<IdxT>);
 
  public:
   /**
@@ -158,6 +159,7 @@
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
       val_arr_[i] = kDummy;
+      idx_arr_[i] = IdxT{};
     }
   }
@@ -280,6 +282,7 @@ class warp_sort_filtered : public warp_sort {
 #pragma unroll
     for (int i = 0; i < kMaxBufLen; i++) {
       val_buf_[i] = kDummy;
+      idx_buf_[i] = IdxT{};
     }
   }
@@ -371,6 +374,7 @@ class warp_sort_immediate : public warp_sort {
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
       val_buf_[i] = kDummy;
+      idx_buf_[i] = IdxT{};
     }
   }
@@ -429,9 +433,9 @@ template