From c325213fbc0e68b5b418a03eda0a25d348248fbc Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Thu, 26 Dec 2024 16:02:03 -0800 Subject: [PATCH] WIP --- 3rdparty/find_dependencies.cmake | 10 +- CMakeLists.txt | 16 +- cmake/Open3DSYCLTargetSources.cmake | 6 +- cpp/open3d/core/CMakeLists.txt | 49 ++- cpp/open3d/core/Device.h | 8 + cpp/open3d/core/Indexer.h | 8 +- cpp/open3d/core/Indexer.isph | 4 +- cpp/open3d/core/ParallelFor.h | 5 + cpp/open3d/core/ParallelForSYCL.h | 63 ++++ cpp/open3d/core/SYCLContext.cpp | 51 +++- cpp/open3d/core/SYCLContext.h | 29 +- cpp/open3d/core/kernel/Arange.cpp | 6 + cpp/open3d/core/kernel/Arange.h | 7 + cpp/open3d/core/kernel/ArangeSYCL.cpp | 37 +++ cpp/open3d/core/kernel/BinaryEW.cpp | 6 + cpp/open3d/core/kernel/BinaryEW.h | 7 + cpp/open3d/core/kernel/BinaryEWSYCL.cpp | 270 +++++++++++++++++ cpp/open3d/core/kernel/IndexGetSet.cpp | 9 + cpp/open3d/core/kernel/IndexGetSet.h | 16 + cpp/open3d/core/kernel/IndexGetSetSYCL.cpp | 76 +++++ cpp/open3d/core/kernel/IndexReduction.cpp | 4 + cpp/open3d/core/kernel/IndexReduction.h | 7 + cpp/open3d/core/kernel/IndexReductionSYCL.cpp | 61 ++++ cpp/open3d/core/kernel/NonZero.cpp | 6 + cpp/open3d/core/kernel/NonZero.h | 4 + cpp/open3d/core/kernel/NonZeroSYCL.cpp | 75 +++++ cpp/open3d/core/kernel/Reduction.cpp | 12 + cpp/open3d/core/kernel/Reduction.h | 8 + cpp/open3d/core/kernel/ReductionSYCL.cpp | 164 ++++++++++ cpp/open3d/core/kernel/UnaryEW.cpp | 21 ++ cpp/open3d/core/kernel/UnaryEW.h | 9 +- cpp/open3d/core/kernel/UnaryEWCPU.cpp | 13 - cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp | 53 ++++ cpp/open3d/core/kernel/UnaryEWSYCL.cpp | 286 ++++++++++++++++-- cpp/open3d/core/linalg/AddMM.cpp | 7 + cpp/open3d/core/linalg/AddMM.h | 18 ++ cpp/open3d/core/linalg/AddMMSYCL.cpp | 49 +++ cpp/open3d/core/linalg/Inverse.cpp | 14 + cpp/open3d/core/linalg/Inverse.h | 11 +- cpp/open3d/core/linalg/InverseSYCL.cpp | 48 +++ cpp/open3d/core/linalg/LU.cpp | 28 +- cpp/open3d/core/linalg/LUImpl.h | 9 + cpp/open3d/core/linalg/LUSYCL.cpp | 42 +++ cpp/open3d/core/linalg/LeastSquares.cpp | 6 + cpp/open3d/core/linalg/LeastSquares.h | 10 + cpp/open3d/core/linalg/LeastSquaresSYCL.cpp | 46 +++ cpp/open3d/core/linalg/Matmul.cpp | 8 +- cpp/open3d/core/linalg/Matmul.h | 10 + cpp/open3d/core/linalg/MatmulSYCL.cpp | 39 +++ cpp/open3d/core/linalg/SVD.cpp | 14 +- cpp/open3d/core/linalg/SVD.h | 11 + cpp/open3d/core/linalg/SVDSYCL.cpp | 48 +++ cpp/open3d/core/linalg/Solve.cpp | 11 +- cpp/open3d/core/linalg/Solve.h | 10 + cpp/open3d/core/linalg/SolveSYCL.cpp | 44 +++ cpp/open3d/core/linalg/Tri.cpp | 18 ++ cpp/open3d/core/linalg/TriImpl.h | 11 + cpp/open3d/core/linalg/TriSYCL.cpp | 82 +++++ cpp/open3d/t/geometry/kernel/CMakeLists.txt | 2 + cpp/open3d/t/geometry/kernel/IPPImage.cpp | 1 - cpp/tests/core/CMakeLists.txt | 8 + cpp/tests/core/CoreTest.cpp | 9 +- cpp/tests/core/CoreTest.h | 4 + cpp/tests/core/Linalg.cpp | 9 +- cpp/tests/core/ParallelForSYCL.cpp | 65 ++++ cpp/tests/core/Tensor.cpp | 223 +++++++------- cpp/tests/core/TensorCheck.cpp | 17 +- cpp/tests/core/TensorFunction.cpp | 16 +- cpp/tests/core/TensorObject.cpp | 25 +- .../t/geometry/AxisAlignedBoundingBox.cpp | 12 +- cpp/tests/t/geometry/LineSet.cpp | 11 +- cpp/tests/t/geometry/OrientedBoundingBox.cpp | 12 +- cpp/tests/t/geometry/PointCloud.cpp | 11 +- cpp/tests/t/geometry/TensorMap.cpp | 9 +- cpp/tests/t/geometry/TriangleMesh.cpp | 9 +- 75 files changed, 2164 insertions(+), 279 deletions(-) create mode 100644 cpp/open3d/core/ParallelForSYCL.h create mode 100644 cpp/open3d/core/kernel/ArangeSYCL.cpp create 
mode 100644 cpp/open3d/core/kernel/BinaryEWSYCL.cpp create mode 100644 cpp/open3d/core/kernel/IndexGetSetSYCL.cpp create mode 100644 cpp/open3d/core/kernel/IndexReductionSYCL.cpp create mode 100644 cpp/open3d/core/kernel/NonZeroSYCL.cpp create mode 100644 cpp/open3d/core/kernel/ReductionSYCL.cpp create mode 100644 cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp create mode 100644 cpp/open3d/core/linalg/AddMMSYCL.cpp create mode 100644 cpp/open3d/core/linalg/InverseSYCL.cpp create mode 100644 cpp/open3d/core/linalg/LUSYCL.cpp create mode 100644 cpp/open3d/core/linalg/LeastSquaresSYCL.cpp create mode 100644 cpp/open3d/core/linalg/MatmulSYCL.cpp create mode 100644 cpp/open3d/core/linalg/SVDSYCL.cpp create mode 100644 cpp/open3d/core/linalg/SolveSYCL.cpp create mode 100644 cpp/open3d/core/linalg/TriSYCL.cpp create mode 100644 cpp/tests/core/ParallelForSYCL.cpp diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index d2afbe405c9..62ea33ee1fe 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1532,12 +1532,14 @@ open3d_import_3rdparty_library(3rdparty_uvatlas list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_uvatlas) +# SYCL link options are specified here. Compile options are only applied to SYCL source files and are specified in cmake/Open3DSYCLTargetSources.cmake if(BUILD_SYCL_MODULE) add_library(3rdparty_sycl INTERFACE) target_link_libraries(3rdparty_sycl INTERFACE $<$,$>>:sycl>) target_link_options(3rdparty_sycl INTERFACE - $<$,$>>:-fsycl -fsycl-targets=spir64_x86_64>) + $<$,$>>:-fsycl -fsycl-targets=intel_gpu_acm_g10>) + # $<$,$>>:-fsycl -fsycl-targets=spir64,spir64_gen>) if(NOT BUILD_SHARED_LIBS OR arg_PUBLIC) install(TARGETS 3rdparty_sycl EXPORT Open3DTargets) endif() @@ -1563,8 +1565,12 @@ if(OPEN3D_USE_ONEAPI_PACKAGES) GROUPED INCLUDE_DIRS ${MKL_INCLUDE}/ LIB_DIR ${MKL_ROOT}/lib/intel64 - LIBRARIES mkl_intel_ilp64 mkl_tbb_thread mkl_core + LIBRARIES $<$:mkl_sycl> mkl_intel_ilp64 mkl_tbb_thread mkl_core ) + if (BUILD_SYCL_MODULE) + # target_link_options(3rdparty_mkl INTERFACE "-Wl,-export-dynamic") + target_link_libraries(3rdparty_mkl INTERFACE OpenCL) + endif() # MKL definitions target_compile_options(3rdparty_mkl INTERFACE "$<$:$<$:-m64>>") target_compile_definitions(3rdparty_mkl INTERFACE "$<$:MKL_ILP64>") diff --git a/CMakeLists.txt b/CMakeLists.txt index e1bd4706288..635e43e6abf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,15 +282,19 @@ endif() if(BUILD_SYCL_MODULE AND NOT GLIBCXX_USE_CXX11_ABI) message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires GLIBCXX_USE_CXX11_ABI=ON") endif() -if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") -endif() -if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") -endif() +# if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) +# message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") +# endif() +# if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) +# message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") +# endif() if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE) message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.") endif() +# Use LLD with icpx for faster linking +if (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + add_link_options("-fuse-ld=lld") +endif() # Global flag to set CXX standard. # This does not affect 3rd party libraries. 
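[Editor's note, not part of the patch] The comment added above in 3rdparty/find_dependencies.cmake describes the intended split: link-time SYCL flags come from the 3rdparty_sycl interface target (and the MKL SYCL library when BUILD_SYCL_MODULE is on), while the per-source compile flags (-fsycl, -fsycl-targets=...) are attached only to files registered through open3d_sycl_target_sources() in cmake/Open3DSYCLTargetSources.cmake, shown in the next hunk. A minimal sketch of how a SYCL translation unit is expected to be wired up, mirroring the calls this patch adds to cpp/open3d/core/CMakeLists.txt (the target name and file list are only an example):

    # Sources listed here receive the per-file SYCL compile options when
    # BUILD_SYCL_MODULE=ON; otherwise the function behaves like target_sources().
    if (BUILD_SYCL_MODULE)
        open3d_sycl_target_sources(core_impl PRIVATE
            kernel/UnaryEWSYCL.cpp
            kernel/BinaryEWSYCL.cpp
        )
    endif()
    # No per-target link flags are needed here: -fsycl and the device target are
    # propagated at link time by the 3rdparty_sycl INTERFACE target.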
diff --git a/cmake/Open3DSYCLTargetSources.cmake b/cmake/Open3DSYCLTargetSources.cmake index 9b0220c10ab..baa075f38e7 100644 --- a/cmake/Open3DSYCLTargetSources.cmake +++ b/cmake/Open3DSYCLTargetSources.cmake @@ -2,7 +2,8 @@ # # When BUILD_SYCL_MODULE=ON, set SYCL-specific compile flags for the listed # source files and call target_sources(). If BUILD_SYCL_MODULE=OFF, this -# function directly calls target_sources(). +# function directly calls target_sources(). For SYCL link options, see +# 3rdparty/find_dependencies.cmake # # Note: this is not a perfect forwarding to target_sources(), as it only support # limited set of arguments. See the example usage below. @@ -31,7 +32,8 @@ function(open3d_sycl_target_sources target) if(BUILD_SYCL_MODULE) foreach(sycl_file IN LISTS arg_UNPARSED_ARGUMENTS) set_source_files_properties(${sycl_file} PROPERTIES - COMPILE_OPTIONS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_x86_64) + COMPILE_OPTIONS "-fsycl;-fsycl-targets=intel_gpu_acm_g10") + #COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64,spir64_gen") if(arg_VERBOSE) message(STATUS "open3d_sycl_target_sources(${target}): marked ${sycl_file} as SYCL code") endif() diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index c34cea3fc6d..210cc33058a 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -5,7 +5,6 @@ target_sources(core PRIVATE CUDAUtils.cpp Device.cpp Dtype.cpp - EigenConverter.cpp Indexer.cpp MemoryManager.cpp MemoryManagerCached.cpp @@ -23,6 +22,7 @@ target_sources(core PRIVATE # Compile regardless BUILD_SYCL_MODULE == ON or OFF. open3d_sycl_target_sources(core PRIVATE + EigenConverter.cpp SYCLUtils.cpp ) @@ -43,22 +43,14 @@ target_sources(core PRIVATE hashmap/HashSet.cpp kernel/Kernel.cpp linalg/AddMM.cpp - linalg/AddMMCPU.cpp linalg/Det.cpp linalg/Inverse.cpp - linalg/InverseCPU.cpp linalg/LeastSquares.cpp - linalg/LeastSquaresCPU.cpp linalg/LU.cpp - linalg/LUCPU.cpp linalg/Matmul.cpp - linalg/MatmulCPU.cpp linalg/Solve.cpp - linalg/SolveCPU.cpp linalg/SVD.cpp - linalg/SVDCPU.cpp linalg/Tri.cpp - linalg/TriCPU.cpp nns/FixedRadiusIndex.cpp nns/FixedRadiusSearchOps.cpp nns/KnnIndex.cpp @@ -73,21 +65,48 @@ set_target_properties(core_impl PROPERTIES CXX_VISIBILITY_PRESET "hidden") target_sources(core_impl PRIVATE kernel/Arange.cpp - kernel/ArangeCPU.cpp kernel/BinaryEW.cpp - kernel/BinaryEWCPU.cpp kernel/IndexGetSet.cpp - kernel/IndexGetSetCPU.cpp kernel/IndexReduction.cpp - kernel/IndexReductionCPU.cpp kernel/NonZero.cpp - kernel/NonZeroCPU.cpp kernel/Reduction.cpp - kernel/ReductionCPU.cpp kernel/UnaryEW.cpp + kernel/ArangeCPU.cpp + kernel/BinaryEWCPU.cpp + kernel/IndexGetSetCPU.cpp + kernel/IndexReductionCPU.cpp + kernel/NonZeroCPU.cpp + kernel/ReductionCPU.cpp kernel/UnaryEWCPU.cpp + linalg/AddMMCPU.cpp + linalg/InverseCPU.cpp + linalg/LeastSquaresCPU.cpp + linalg/LUCPU.cpp + linalg/MatmulCPU.cpp + linalg/SolveCPU.cpp + linalg/SVDCPU.cpp + linalg/TriCPU.cpp +) + +if (BUILD_SYCL_MODULE) +open3d_sycl_target_sources(core_impl PRIVATE kernel/UnaryEWSYCL.cpp + kernel/BinaryEWSYCL.cpp + kernel/ArangeSYCL.cpp + kernel/IndexGetSetSYCL.cpp + kernel/NonZeroSYCL.cpp + kernel/IndexReductionSYCL.cpp + kernel/ReductionSYCL.cpp + linalg/AddMMSYCL.cpp + linalg/InverseSYCL.cpp + linalg/LeastSquaresSYCL.cpp + linalg/LUSYCL.cpp + linalg/MatmulSYCL.cpp + linalg/SolveSYCL.cpp + linalg/SVDSYCL.cpp + linalg/TriSYCL.cpp ) +endif() if (BUILD_CUDA_MODULE) target_sources(core PRIVATE diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h index 
a79d9cf646c..d215d16e55e 100644 --- a/cpp/open3d/core/Device.h +++ b/cpp/open3d/core/Device.h @@ -115,4 +115,12 @@ struct hash { return std::hash{}(device.ToString()); } }; + +template <> +struct less { + bool operator()(const open3d::core::Device& lhs, + const open3d::core::Device& rhs) const { + return lhs.ToString() < rhs.ToString(); + } +}; } // namespace std diff --git a/cpp/open3d/core/Indexer.h b/cpp/open3d/core/Indexer.h index c9eb8933d49..4fa7a61c21b 100644 --- a/cpp/open3d/core/Indexer.h +++ b/cpp/open3d/core/Indexer.h @@ -34,11 +34,11 @@ class Indexer; class IndexerIterator; // Maximum number of dimensions of TensorRef. -static constexpr int64_t MAX_DIMS = 10; +static constexpr int64_t MAX_DIMS = 4; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -static constexpr int64_t MAX_INPUTS = 10; +static constexpr int64_t MAX_INPUTS = 4; // Maximum number of outputs of an op. This number can be increased when // necessary. @@ -110,7 +110,7 @@ struct TensorRef { TensorRef(const Tensor& t) { if (t.NumDims() > MAX_DIMS) { - utility::LogError("Tenor has too many dimensions {} > {}.", + utility::LogError("Tensor has too many dimensions {} > {}.", t.NumDims(), MAX_DIMS); } data_ptr_ = const_cast(t.GetDataPtr()); @@ -638,7 +638,7 @@ class Indexer { class IndexerIterator { public: struct Iterator { - Iterator(){}; + Iterator() {}; Iterator(const Indexer& indexer); Iterator(Iterator&& other) = default; diff --git a/cpp/open3d/core/Indexer.isph b/cpp/open3d/core/Indexer.isph index c9b63486377..af7629d3f06 100644 --- a/cpp/open3d/core/Indexer.isph +++ b/cpp/open3d/core/Indexer.isph @@ -11,11 +11,11 @@ #include "open3d/utility/Helper.isph" // Maximum number of dimensions of TensorRef. -enum { MAX_DIMS = 10 }; +enum { MAX_DIMS = 4 }; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -enum { MAX_INPUTS = 10 }; +enum { MAX_INPUTS = 4 }; // Maximum number of outputs of an op. This number can be increased when // necessary. diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h index 2d5bef78812..9e917789947 100644 --- a/cpp/open3d/core/ParallelFor.h +++ b/cpp/open3d/core/ParallelFor.h @@ -99,6 +99,11 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) { /// \note If you use a lambda function, capture only the required variables /// instead of all to prevent accidental race conditions. If you want the /// kernel to be used on both CPU and CUDA, capture the variables by value. +/// \note This does not dispatch to SYCL, since SYCL has extra constraints: +/// - Lambdas may capture by value only. +/// - No function pointers / virtual functions. +/// Auto dispatch to SYCL will enforce these conditions even on CPU devices. Use +/// ParallelForSYCL instead. 
template void ParallelFor(const Device& device, int64_t n, const func_t& func) { #ifdef __CUDACC__ diff --git a/cpp/open3d/core/ParallelForSYCL.h b/cpp/open3d/core/ParallelForSYCL.h new file mode 100644 index 00000000000..d74d4853734 --- /dev/null +++ b/cpp/open3d/core/ParallelForSYCL.h @@ -0,0 +1,63 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#pragma once + +#include +#include + +#include "open3d/core/Device.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + Indexer indexer, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + int64_t n = indexer.NumWorkloads(); + if (n == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(n, [indexer, func_args...](int64_t i) { + Functor ef(indexer, func_args...); + ef(i); + }).wait_and_throw(); +} + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + int64_t num_workloads, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + if (num_workloads == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(num_workloads, [func_args...](int64_t i) { + Functor ef(func_args...); + ef(i); + }).wait_and_throw(); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp index 30c5f6f6afc..2264b67b056 100644 --- a/cpp/open3d/core/SYCLContext.cpp +++ b/cpp/open3d/core/SYCLContext.cpp @@ -27,19 +27,44 @@ SYCLContext &SYCLContext::GetInstance() { bool SYCLContext::IsAvailable() { return devices_.size() > 0; } bool SYCLContext::IsDeviceAvailable(const Device &device) { - bool rc = false; - for (const Device &device_ : devices_) { - if (device == device_) { - rc = true; - break; - } + return devices_.find(device) != devices_.end(); +} +std::vector SYCLContext::GetAvailableSYCLDevices() { + std::vector device_vec; + for (const auto &device : devices_) { + device_vec.push_back(device.first); } - return rc; + return device_vec; } -std::vector SYCLContext::GetAvailableSYCLDevices() { return devices_; } sycl::queue SYCLContext::GetDefaultQueue(const Device &device) { - return device_to_default_queue_.at(device); + return devices_.at(device).queue; +} + +SYCLDevice::SYCLDevice(const sycl::device &sycl_device) { + namespace sid = sycl::info::device; + device = sycl_device; + queue = sycl::queue(device); + name = device.get_info(); + max_work_group_size = device.get_info(); + auto aspects = device.get_info(); + fp64 = std::find(aspects.begin(), aspects.end(), sycl::aspect::fp64) != + aspects.end(); + if (!fp64) { + utility::LogWarning( + "SYCL device {} does not support double precision. 
Using " + "emulation.", + name); + } + usm_device_allocations = + std::find(aspects.begin(), aspects.end(), + sycl::aspect::usm_device_allocations) != aspects.end(); + if (!usm_device_allocations) { + utility::LogWarning( + "SYCL device {} does not support USM device allocations. " + "Open3D SYCL support may not work.", + name); + } } SYCLContext::SYCLContext() { @@ -48,9 +73,7 @@ SYCLContext::SYCLContext() { try { const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v); const Device open3d_device = Device("SYCL:0"); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } @@ -66,9 +89,7 @@ SYCLContext::SYCLContext() { const sycl::device &sycl_device = sycl::device(sycl::cpu_selector_v); const Device open3d_device = Device("SYCL:" + std::to_string(devices_.size())); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h index 341da29369e..5c2a160b75e 100644 --- a/cpp/open3d/core/SYCLContext.h +++ b/cpp/open3d/core/SYCLContext.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "open3d/core/Device.h" @@ -23,6 +23,18 @@ namespace open3d { namespace core { namespace sy { +/// @brief SYCL device properties. +struct SYCLDevice { + SYCLDevice(const sycl::device& sycl_device); + std::string name; ///< Fiendlly / descriptive name of the device. + sycl::device device; ///< SYCL device. + sycl::queue queue; ///< Default queue for this device. + size_t max_work_group_size; ///< Preferred work group size + bool fp64; ///< Double precision support, else need to emulate. + bool usm_device_allocations; ///< USM device allocations required for + ///< Open3D. +}; + /// Singleton SYCL context manager. It maintains: /// - A default queue for each SYCL device class SYCLContext { @@ -45,17 +57,16 @@ class SYCLContext { /// Get the default SYCL queue given an Open3D device. sycl::queue GetDefaultQueue(const Device& device); + /// Get SYCL device properties given an Open3D device. + SYCLDevice GetDeviceProperties(const Device& device) { + return devices_.at(device); + }; + private: SYCLContext(); - /// List of available Open3D SYCL devices. - std::vector devices_; - - /// Maps core::Device to the corresponding default SYCL queue. - std::unordered_map device_to_default_queue_; - - /// Maps core::Device to sycl::device. Internal use only for now. - std::unordered_map device_to_sycl_device_; + /// Map from available Open3D SYCL devices to their properties. 
+ std::map devices_; }; } // namespace sy diff --git a/cpp/open3d/core/kernel/Arange.cpp b/cpp/open3d/core/kernel/Arange.cpp index b3385a2a10c..3c85401eece 100644 --- a/cpp/open3d/core/kernel/Arange.cpp +++ b/cpp/open3d/core/kernel/Arange.cpp @@ -63,6 +63,12 @@ Tensor Arange(const Tensor& start, const Tensor& stop, const Tensor& step) { if (device.IsCPU()) { ArangeCPU(start, stop, step, dst); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ArangeSYCL(start, stop, step, dst); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ArangeCUDA(start, stop, step, dst); diff --git a/cpp/open3d/core/kernel/Arange.h b/cpp/open3d/core/kernel/Arange.h index 1a0e88323fd..4547960b5f4 100644 --- a/cpp/open3d/core/kernel/Arange.h +++ b/cpp/open3d/core/kernel/Arange.h @@ -20,6 +20,13 @@ void ArangeCPU(const Tensor& start, const Tensor& step, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void ArangeCUDA(const Tensor& start, const Tensor& stop, diff --git a/cpp/open3d/core/kernel/ArangeSYCL.cpp b/cpp/open3d/core/kernel/ArangeSYCL.cpp new file mode 100644 index 00000000000..cda8912be90 --- /dev/null +++ b/cpp/open3d/core/kernel/ArangeSYCL.cpp @@ -0,0 +1,37 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Arange.h" + +namespace open3d { +namespace core { +namespace kernel { + +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst) { + Dtype dtype = start.GetDtype(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(start.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t sstart = start.Item(); + scalar_t sstep = step.Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + int64_t n = dst.GetLength(); + queue.parallel_for(n, [=](int64_t i) { + dst_ptr[i] = sstart + static_cast(sstep * i); + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/BinaryEW.cpp b/cpp/open3d/core/kernel/BinaryEW.cpp index 00eb9b388e5..e34122cd137 100644 --- a/cpp/open3d/core/kernel/BinaryEW.cpp +++ b/cpp/open3d/core/kernel/BinaryEW.cpp @@ -51,6 +51,12 @@ void BinaryEW(const Tensor& lhs, if (lhs.IsCPU()) { BinaryEWCPU(lhs, rhs, dst, op_code); + } else if (lhs.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + BinaryEWSYCL(lhs, rhs, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (lhs.IsCUDA()) { #ifdef BUILD_CUDA_MODULE BinaryEWCUDA(lhs, rhs, dst, op_code); diff --git a/cpp/open3d/core/kernel/BinaryEW.h b/cpp/open3d/core/kernel/BinaryEW.h index fbd4af3e849..2f2cb57888e 100644 --- a/cpp/open3d/core/kernel/BinaryEW.h +++ b/cpp/open3d/core/kernel/BinaryEW.h @@ -48,6 +48,13 @@ void BinaryEWCPU(const Tensor& lhs, Tensor& dst, BinaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code); 
+#endif + #ifdef BUILD_CUDA_MODULE void BinaryEWCUDA(const Tensor& lhs, const Tensor& rhs, diff --git a/cpp/open3d/core/kernel/BinaryEWSYCL.cpp b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp new file mode 100644 index 00000000000..1466b11f97e --- /dev/null +++ b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp @@ -0,0 +1,270 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelForSYCL.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/BinaryEW.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + +struct BinaryElementKernel { + void operator()(int64_t i) {} + BinaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + +protected: + Indexer indexer; +}; + +// Min, Max +#define BINARY_ELEMENT_KERNEL(name, elem_fn) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = elem_fn(*lhs, *rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Max, sycl::max); +BINARY_ELEMENT_KERNEL(Min, sycl::min); +#undef BINARY_ELEMENT_KERNEL + +/// Specialize Min, Max for Bool, since sycl::min, sycl::max do not support it. +template <> +struct MaxElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs || *rhs; + } +}; +template <> +struct MinElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs && *rhs; + } +}; + +// Arithmetic and Relational ops. +#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = (*lhs)elem_op(*rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Add, +); +BINARY_ELEMENT_KERNEL(Sub, -); +BINARY_ELEMENT_KERNEL(Mul, *); +BINARY_ELEMENT_KERNEL(Div, /); +BINARY_ELEMENT_KERNEL(Gt, >); +BINARY_ELEMENT_KERNEL(Lt, <); +BINARY_ELEMENT_KERNEL(Geq, >=); +BINARY_ELEMENT_KERNEL(Leq, <=); +BINARY_ELEMENT_KERNEL(Eq, ==); +BINARY_ELEMENT_KERNEL(Neq, !=); +#undef BINARY_ELEMENT_KERNEL + +// Logical ops. 
+#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(*lhs) elem_op static_cast(*rhs); \ + } \ + } +BINARY_ELEMENT_KERNEL(LogicalAnd, &&); +BINARY_ELEMENT_KERNEL(LogicalOr, ||); +BINARY_ELEMENT_KERNEL(LogicalXor, !=); +#undef BINARY_ELEMENT_KERNEL + +} // namespace + +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code) { + Dtype src_dtype = lhs.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device device = lhs.GetDevice(); + + if (s_boolean_binary_ew_op_codes.find(op_code) != + s_boolean_binary_ew_op_codes.end()) { + if (dst_dtype == src_dtype) { + // Inplace boolean op's output type is the same as the + // input. e.g. np.logical_and(a, b, out=a), where a, b are + // floats. + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else if (dst_dtype == core::Bool) { + // By default, output is boolean type. 
+ Indexer indexer({lhs, rhs}, dst, + DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL< + LogicalAndElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL< + LogicalXorElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>( + device, indexer); + break; + default: + break; + } + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == BinaryEWOpCode::Maximum || + op_code == BinaryEWOpCode::Minimum) { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Maximum: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Minimum: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Add: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Sub: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Mul: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Div: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } +} +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexGetSet.cpp b/cpp/open3d/core/kernel/IndexGetSet.cpp index 601b4d73f64..b880f50fb51 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.cpp +++ b/cpp/open3d/core/kernel/IndexGetSet.cpp @@ -35,6 +35,10 @@ void IndexGet(const Tensor& src, if (src.IsCPU()) { IndexGetCPU(src, dst, index_tensors, indexed_shape, indexed_strides); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexGetSYCL(src, dst, index_tensors, indexed_shape, indexed_strides); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexGetCUDA(src, dst, index_tensors, indexed_shape, indexed_strides); @@ -56,6 +60,11 @@ void IndexSet(const Tensor& src, if (dst.IsCPU()) { IndexSetCPU(src_same_device, dst, index_tensors, indexed_shape, indexed_strides); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexSetSYCL(src_same_device, dst, index_tensors, indexed_shape, + indexed_strides); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexSetCUDA(src_same_device, dst, index_tensors, indexed_shape, diff --git a/cpp/open3d/core/kernel/IndexGetSet.h b/cpp/open3d/core/kernel/IndexGetSet.h index 130b80836a9..c4a6d3b22c2 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.h +++ b/cpp/open3d/core/kernel/IndexGetSet.h @@ -26,6 +26,14 @@ void IndexGetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& 
indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexGetCUDA(const Tensor& src, Tensor& dst, @@ -46,6 +54,14 @@ void IndexSetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexSetCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp new file mode 100644 index 00000000000..dfc56397417 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/AdvancedIndexing.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/IndexGetSet.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::GET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::SET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexReduction.cpp b/cpp/open3d/core/kernel/IndexReduction.cpp index 19265d36620..e9c9df3f388 100644 --- a/cpp/open3d/core/kernel/IndexReduction.cpp +++ b/cpp/open3d/core/kernel/IndexReduction.cpp @@ -35,6 +35,10 @@ void 
IndexAdd_(int64_t dim, if (dst.IsCPU()) { IndexAddCPU_(dim, index, src_permute, dst_permute); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexAddSYCL_(dim, index, src_permute, dst_permute); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexAddCUDA_(dim, index, src_permute, dst_permute); diff --git a/cpp/open3d/core/kernel/IndexReduction.h b/cpp/open3d/core/kernel/IndexReduction.h index f4fd3516b41..2b4e122f3f0 100644 --- a/cpp/open3d/core/kernel/IndexReduction.h +++ b/cpp/open3d/core/kernel/IndexReduction.h @@ -24,6 +24,13 @@ void IndexAddCPU_(int64_t dim, const Tensor& src, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void IndexAddCUDA_(int64_t dim, const Tensor& index, diff --git a/cpp/open3d/core/kernel/IndexReductionSYCL.cpp b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp new file mode 100644 index 00000000000..47da284dc93 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp @@ -0,0 +1,61 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst) { + // index: [N,], src: [N, D], dst: [M, D] + // In Indexer, output shape defines the actual primary strides. + // However, in IndexAdd_, input dominates the iterations. + // So put dst (output) at indexer's input, and src (input) at output. + Indexer indexer({dst}, src, DtypePolicy::NONE); + + // Index is simply a 1D contiguous tensor, with a different stride + // behavior to src. So use raw pointer for simplicity. 
+ auto index_ptr = index.GetDataPtr(); + + int64_t broadcasting_elems = 1; + for (int64_t d = 1; d < src.NumDims(); ++d) { + broadcasting_elems *= src.GetShape(d); + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + + // TODO: Replace with SYCL reduction API + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + queue.parallel_for(index.GetLength(), [=](int64_t workload_idx) { + int64_t reduction_idx = workload_idx / broadcasting_elems; + int64_t broadcasting_idx = workload_idx % broadcasting_elems; + + const int64_t idx = index_ptr[reduction_idx]; + int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx; + + // Note input and output is switched here to adapt to the + // indexer + scalar_t* src_ptr = indexer.GetOutputPtr(0, idx); + scalar_t* dst_ptr = indexer.GetInputPtr(0, dst_idx); + sycl::atomic_ref(*dst_ptr) += + *src_ptr; + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/NonZero.cpp b/cpp/open3d/core/kernel/NonZero.cpp index 6d04f94e40a..686ce9ea885 100644 --- a/cpp/open3d/core/kernel/NonZero.cpp +++ b/cpp/open3d/core/kernel/NonZero.cpp @@ -18,6 +18,12 @@ namespace kernel { Tensor NonZero(const Tensor& src) { if (src.IsCPU()) { return NonZeroCPU(src); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return NonZeroSYCL(src); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE return NonZeroCUDA(src); diff --git a/cpp/open3d/core/kernel/NonZero.h b/cpp/open3d/core/kernel/NonZero.h index ab926ddd8c3..36ac8b4f5a0 100644 --- a/cpp/open3d/core/kernel/NonZero.h +++ b/cpp/open3d/core/kernel/NonZero.h @@ -17,6 +17,10 @@ Tensor NonZero(const Tensor& src); Tensor NonZeroCPU(const Tensor& src); +#ifdef BUILD_SYCL_MODULE +Tensor NonZeroSYCL(const Tensor& src); +#endif + #ifdef BUILD_CUDA_MODULE Tensor NonZeroCUDA(const Tensor& src); #endif diff --git a/cpp/open3d/core/kernel/NonZeroSYCL.cpp b/cpp/open3d/core/kernel/NonZeroSYCL.cpp new file mode 100644 index 00000000000..9a9795e9f6a --- /dev/null +++ b/cpp/open3d/core/kernel/NonZeroSYCL.cpp @@ -0,0 +1,75 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include + +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/kernel/NonZero.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +/// Maximum number of dimensions of TensorRef, rounded up to the power of 2 for +/// sycl::vec support. +static constexpr int64_t MAX_DIMS_POW2 = 4; +static_assert(MAX_DIMS_POW2 >= MAX_DIMS, "MAX_DIMS_POW2 too small."); +namespace kernel { + +Tensor NonZeroSYCL(const Tensor& src) { + // Get flattened non-zero indices. 
+ TensorIterator src_iter(src); + const int64_t num_elements = src.NumElements(); + auto device = src.GetDevice(); + Tensor indices = Tensor::Arange(0, num_elements, 1, core::Int64, device); + Tensor non_zero_indices(SizeVector({num_elements}), Int64, device); + int64_t *non_zero_indices_ptr = non_zero_indices.GetDataPtr(), + *indices_ptr = indices.GetDataPtr(); + size_t num_non_zeros; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src.GetDtype(), [&]() { + auto it = std::copy_if( + oneapi::dpl::execution::dpcpp_default, indices_ptr, + indices_ptr + num_elements, non_zero_indices_ptr, + [src_iter](int64_t index) { + auto src_ptr = static_cast( + src_iter.GetPtr(index)); + OPEN3D_ASSERT(src_ptr != nullptr && "Internal error."); + return *src_ptr != 0; + }); + num_non_zeros = std::distance(non_zero_indices_ptr, it); + }); + + // Transform flattened indices to indices in each dimension. + const auto num_dims = src.NumDims(); + SizeVector shape = src.GetShape(); + sycl::vec shape_vec; // device copyable + OPEN3D_ASSERT(shape.size() <= MAX_DIMS_POW2 && "Too many dimensions."); + for (auto k = 0; k < num_dims; ++k) shape_vec[k] = shape[k]; + Tensor result({num_dims, static_cast(num_non_zeros)}, Int64, + device); + int64_t* result_ptr = result.GetDataPtr(); + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.parallel_for(num_non_zeros, [=](int64_t i) { + auto non_zero_index = non_zero_indices_ptr[i]; + auto this_result_ptr = + result_ptr + i + (num_dims - 1) * num_non_zeros; + OPEN3D_ASSERT(this_result_ptr != nullptr && "Internal error."); + for (auto dim = num_dims - 1; dim >= 0; + dim--, this_result_ptr -= num_non_zeros) { + *this_result_ptr = non_zero_index % shape_vec[dim]; + non_zero_index = non_zero_index / shape_vec[dim]; + } + }).wait_and_throw(); + return result; +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/Reduction.cpp b/cpp/open3d/core/kernel/Reduction.cpp index a94087d609a..2b3052a2c48 100644 --- a/cpp/open3d/core/kernel/Reduction.cpp +++ b/cpp/open3d/core/kernel/Reduction.cpp @@ -71,6 +71,18 @@ void Reduction(const Tensor& src, if (src.IsCPU()) { ReductionCPU(src, dst, dims, keepdim, op_code); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Tensor dst_cpu = + // Tensor::Empty(dst.GetShape(), dst.GetDtype(), + // Device("SYCL:1")); + // ReductionSYCL(src.To(Device("SYCL:1")), dst_cpu, dims, keepdim, + // op_code); + // dst.CopyFrom(dst_cpu); + ReductionSYCL(src, dst, dims, keepdim, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ReductionCUDA(src, dst, dims, keepdim, op_code); diff --git a/cpp/open3d/core/kernel/Reduction.h b/cpp/open3d/core/kernel/Reduction.h index a1a4b74e3a3..ec3bec62ab7 100644 --- a/cpp/open3d/core/kernel/Reduction.h +++ b/cpp/open3d/core/kernel/Reduction.h @@ -59,6 +59,14 @@ void ReductionCPU(const Tensor& src, bool keepdim, ReductionOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void ReductionCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/ReductionSYCL.cpp b/cpp/open3d/core/kernel/ReductionSYCL.cpp new file mode 100644 index 00000000000..b4c96112866 --- /dev/null +++ b/cpp/open3d/core/kernel/ReductionSYCL.cpp @@ -0,0 +1,164 @@ +// 
---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Reduction.h" +#include "open3d/utility/Logging.h" +#include "open3d/utility/Parallel.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { +// Based on OneAPI GPU optimization guide code sample (Blocked access to input +// data + SYCL builtin reduction ops for final reduction) +template +void SYCLReductionEngine( + Device device, + Indexer indexer, + scalar_t identity = + sycl::known_identity::value) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD sizse) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + auto output = + reinterpret_cast(scalar_out_indexer.GetOutputPtr(0)); + auto e = queue.submit([&](auto& cgh) { + auto sycl_reducer = sycl::reduction( + output, identity, red_op, + {sycl::property::reduction::initialize_to_identity()}); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) { + auto glob_id = item.get_global_id(0); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + auto item_out = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + scalar_t* val = reinterpret_cast( + scalar_out_indexer.GetInputPtr(0, idx)); + item_out = red_op(item_out, *val); + } + red_arg.combine(item_out); + }); + }); + } + queue.wait_and_throw(); +} +} // namespace + +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code) { + Device device = src.GetDevice(); + if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + switch (op_code) { + case ReductionOpCode::Sum: + dst.Fill(0); + SYCLReductionEngine, scalar_t>( + device, indexer); + break; + case ReductionOpCode::Prod: + dst.Fill(1); + SYCLReductionEngine, scalar_t>( + 
device, indexer); + break; + case ReductionOpCode::Min: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Min."); + } else { + SYCLReductionEngine, scalar_t>( + device, indexer); + } + break; + case ReductionOpCode::Max: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Max."); + } else { + SYCLReductionEngine, scalar_t>( + device, indexer); + } + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) { + utility::LogError("SYCL Arg-reduction is not implemented."); + } else if (s_boolean_reduce_ops.find(op_code) != + s_boolean_reduce_ops.end()) { + if (src.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean input tensor."); + } + if (dst.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean output tensor."); + } + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + switch (op_code) { + case ReductionOpCode::All: + // Identity == true. 0-sized tensor, returns true. + dst.Fill(true); + SYCLReductionEngine, bool>(device, + indexer); + break; + case ReductionOpCode::Any: + // Identity == false. 0-sized tensor, returns false. + dst.Fill(false); + SYCLReductionEngine, bool>(device, + indexer); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + } else { + utility::LogError("Unsupported op code."); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEW.cpp b/cpp/open3d/core/kernel/UnaryEW.cpp index 911b2885e34..2513d41205d 100644 --- a/cpp/open3d/core/kernel/UnaryEW.cpp +++ b/cpp/open3d/core/kernel/UnaryEW.cpp @@ -22,6 +22,19 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { src.GetShape(), dst.GetShape()); } + // Check dtype compatibility + const auto float_only_ops = {UnaryEWOpCode::Sqrt, UnaryEWOpCode::Sin, + UnaryEWOpCode::Cos, UnaryEWOpCode::Exp, + UnaryEWOpCode::IsNan, UnaryEWOpCode::IsInf, + UnaryEWOpCode::IsFinite}; + Dtype src_dtype = src.GetDtype(); + if (std::find(float_only_ops.begin(), float_only_ops.end(), op_code) != + float_only_ops.end() && + src_dtype != core::Float32 && src_dtype != core::Float64) { + utility::LogError("Only supports Float32 and Float64, but {} is used.", + src_dtype.ToString()); + } + // Dispatch to device Device src_device = src.GetDevice(); Device dst_device = dst.GetDevice(); @@ -32,6 +45,12 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { if (src_device.IsCPU()) { UnaryEWCPU(src, dst, op_code); + } else if (src_device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + UnaryEWSYCL(src, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src_device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE UnaryEWCUDA(src, dst, op_code); @@ -73,6 +92,8 @@ void Copy(const Tensor& src, Tensor& dst) { #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif + } else { + utility::LogError("Copy: SYCL <-> CUDA Unimplemented device"); } } diff --git a/cpp/open3d/core/kernel/UnaryEW.h b/cpp/open3d/core/kernel/UnaryEW.h index 907b3371167..131dff08b91 100644 --- a/cpp/open3d/core/kernel/UnaryEW.h +++ b/cpp/open3d/core/kernel/UnaryEW.h @@ -32,15 +32,18 @@ enum class UnaryEWOpCode { }; void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); - void UnaryEWCPU(const Tensor& src, Tensor& dst, 
UnaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); #endif -// Copy is separated from other unary ops since it support cross-device copy and -// dtype casting. +// Copy is separated from other unary ops since it supports cross-device copy +// and dtype casting. void Copy(const Tensor& src, Tensor& dst); void CopyCPU(const Tensor& src, Tensor& dst); diff --git a/cpp/open3d/core/kernel/UnaryEWCPU.cpp b/cpp/open3d/core/kernel/UnaryEWCPU.cpp index 1a502e42a1d..f4673200b24 100644 --- a/cpp/open3d/core/kernel/UnaryEWCPU.cpp +++ b/cpp/open3d/core/kernel/UnaryEWCPU.cpp @@ -218,14 +218,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - auto assert_dtype_is_float = [](Dtype dtype) -> void { - if (dtype != core::Float32 && dtype != core::Float64) { - utility::LogError( - "Only supports Float32 and Float64, but {} is used.", - dtype.ToString()); - } - }; - if (op_code == UnaryEWOpCode::LogicalNot) { if (dst_dtype == src_dtype) { Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); @@ -259,7 +251,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { } else if (op_code == UnaryEWOpCode::IsNan || op_code == UnaryEWOpCode::IsInf || op_code == UnaryEWOpCode::IsFinite) { - assert_dtype_is_float(src_dtype); Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); #ifdef BUILD_ISPC_MODULE ispc::Indexer ispc_indexer = indexer.ToISPC(); @@ -291,7 +282,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { switch (op_code) { case UnaryEWOpCode::Sqrt: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSqrtElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -299,7 +289,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Sin: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSinElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -307,7 +296,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Cos: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUCosElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -322,7 +310,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Exp: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUExpElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp new file mode 100644 index 00000000000..fed05af7b47 --- /dev/null +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp @@ -0,0 +1,53 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include + +#include "open3d/core/Dtype.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/UnaryEW.h" +#include 
"open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void CopySYCL(const Tensor& src, Tensor& dst) { + // It has been checked that + // - at least one of src or dst is SYCL device + SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device dst_device = dst.GetDevice(); + Device src_device = src.GetDevice(); + + if (src_dtype != dst_dtype) { + utility::LogError( + "CopySYCL: Dtype conversion from src to dst not implemented!"); + } + if ((dst_device.IsSYCL() && !dst.IsContiguous()) || + (src_device.IsSYCL() && !src.IsContiguous())) { + utility::LogError( + "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); + } + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src_conti.To(dst_device)); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp index fed05af7b47..40592cf6ded 100644 --- a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp @@ -8,8 +8,12 @@ #include #include +#include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelFor.h" +#include "open3d/core/ParallelForSYCL.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/UnaryEW.h" @@ -19,32 +23,274 @@ namespace open3d { namespace core { namespace kernel { +namespace { + +struct UnaryElementKernel { + UnaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + void operator()(int64_t i) {} + +protected: + Indexer indexer; +}; + +template +struct CopyElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(*src); + } +}; + +// Math: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + src_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(elem_op(static_cast(*src))); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + float* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(Sqrt, sycl::sqrt); +UNARY_ELEMENT_KERNEL(Sin, sycl::sin); +UNARY_ELEMENT_KERNEL(Cos, sycl::cos); +UNARY_ELEMENT_KERNEL(Exp, sycl::exp); +// TODO: Use sycl::abs for integers (no casting) +UNARY_ELEMENT_KERNEL(Abs, sycl::fabs); +UNARY_ELEMENT_KERNEL(Floor, sycl::floor); +UNARY_ELEMENT_KERNEL(Ceil, sycl::ceil); +UNARY_ELEMENT_KERNEL(Round, sycl::round); +UNARY_ELEMENT_KERNEL(Trunc, sycl::trunc); +#undef UNARY_ELEMENT_KERNEL + +// No special treatment for unsigned types - we use the SYCL runtime +// default +template +struct 
NegElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const scalar_t* src = indexer.GetInputPtr(0, i); + scalar_t* dst = indexer.GetOutputPtr(i); + *dst = -*src; + } +}; + +// Float checkers: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(static_cast(*src)); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(IsNan, sycl::isnan); +UNARY_ELEMENT_KERNEL(IsInf, sycl::isinf); +UNARY_ELEMENT_KERNEL(IsFinite, sycl::isfinite); +#undef UNARY_ELEMENT_KERNEL + +template +struct LogicalNotElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(!static_cast(*src)); + } +}; +} // namespace + void CopySYCL(const Tensor& src, Tensor& dst) { - // It has been checked that - // - at least one of src or dst is SYCL device + // src and dst have been checked to have the same shape + // at least one of src and dst is SYCL and the other is SYCL or CPU SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(), dst_dtype = dst.GetDtype(); + Device src_device = src.GetDevice(), dst_device = dst.GetDevice(); + Device device_with_queue = dst.IsSYCL() ? dst.GetDevice() : src.GetDevice(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); + + if (src_device.IsSYCL() && dst_device.IsSYCL()) { + if (src.IsContiguous() && dst.IsContiguous() && + src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst.GetDevice(), + src.GetDataPtr(), src.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else if (dst.NumElements() > 1 && dst.IsContiguous() && + src.NumElements() == 1 /*&& !src_dtype.IsObject()*/) { + int64_t num_elements = dst.NumElements(); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + scalar_t scalar_element = src.To(dst_dtype).Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + queue.fill(dst_ptr, scalar_element, num_elements) + .wait_and_throw(); + }); + } else if (src_device == dst_device) { // non-contiguous or broadcast + // on same SYCL device + Indexer indexer({src}, dst, DtypePolicy::NONE); + if (src.GetDtype().IsObject()) { + // TODO: This is likely very slow. Coalesce into less memcpy + // calls. 
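+                // Each queue.memcpy() below enqueues an asynchronous copy;
+                // the single wait_and_throw() after the loop blocks until all
+                // enqueued copies have finished and rethrows any asynchronous
+                // errors reported by the SYCL runtime.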
+ int64_t object_byte_size = src.GetDtype().ByteSize(); + for (int64_t i = 0; i < indexer.NumWorkloads(); ++i) { + const void* src_ptr = indexer.GetInputPtr(0, i); + void* dst_ptr = indexer.GetOutputPtr(i); + queue.memcpy(dst_ptr, src_ptr, object_byte_size); + } + queue.wait_and_throw(); + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + using src_t = scalar_t; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + using dst_t = scalar_t; + ParallelForSYCL>( + device_with_queue, indexer); + }); + }); + } + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else if (src_device.IsCPU() && dst_device.IsSYCL() || + src_device.IsSYCL() && dst_device.IsCPU()) { + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else { + utility::LogError("Wrong device type {} -> {}", src_device.ToString(), + dst_device.ToString()); + } +} + +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { + // src and dst have been changed to have the same shape, device Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - Device dst_device = dst.GetDevice(); - Device src_device = src.GetDevice(); + Device device = src.GetDevice(); // == dst.GetDevice() - if (src_dtype != dst_dtype) { - utility::LogError( - "CopySYCL: Dtype conversion from src to dst not implemented!"); - } - if ((dst_device.IsSYCL() && !dst.IsContiguous()) || - (src_device.IsSYCL() && !src.IsContiguous())) { - utility::LogError( - "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); - } - Tensor src_conti = src.Contiguous(); // No op if already contiguous - if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && - src_dtype == dst_dtype) { - MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, - src_conti.GetDataPtr(), src_conti.GetDevice(), - src_dtype.ByteSize() * shape.NumElements()); + auto assert_dtype_is_float = [](Dtype dtype) -> void { + if (dtype != Float32 && dtype != Float64) { + utility::LogError( + "Only supports Float32 and Float64, but {} is used.", + dtype.ToString()); + } + }; + + if (op_code == UnaryEWOpCode::LogicalNot) { + if (dst_dtype == src_dtype) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else if (dst_dtype == Bool) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == UnaryEWOpCode::IsNan || + op_code == UnaryEWOpCode::IsInf || + op_code == UnaryEWOpCode::IsFinite) { + assert_dtype_is_float(src_dtype); + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + if (op_code == UnaryEWOpCode::IsNan) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsInf) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsFinite) { + ParallelForSYCL>(device, + indexer); + } + }); } else { - dst.CopyFrom(src_conti.To(dst_device)); + Indexer 
indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case UnaryEWOpCode::Sqrt: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Sin: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Cos: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Neg: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Exp: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Abs: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Floor: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Ceil: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Round: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Trunc: + ParallelForSYCL>(device, + indexer); + break; + default: + utility::LogError("Unimplemented op_code for UnaryEWSYCL"); + break; + } + }); } } diff --git a/cpp/open3d/core/linalg/AddMM.cpp b/cpp/open3d/core/linalg/AddMM.cpp index aea908cecbb..45cfd71df7d 100644 --- a/cpp/open3d/core/linalg/AddMM.cpp +++ b/cpp/open3d/core/linalg/AddMM.cpp @@ -98,6 +98,13 @@ void AddMM(const Tensor& A, ldb, lda, ldc, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + AddMMSYCL(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, + ldb, lda, ldc, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { AddMMCPU(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, diff --git a/cpp/open3d/core/linalg/AddMM.h b/cpp/open3d/core/linalg/AddMM.h index 1754e430fef..6d26703ae1b 100644 --- a/cpp/open3d/core/linalg/AddMM.h +++ b/cpp/open3d/core/linalg/AddMM.h @@ -20,6 +20,24 @@ namespace core { void AddMM( const Tensor& A, const Tensor& B, Tensor& C, double alpha, double beta); +#ifdef BUILD_SYCL_MODULE +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void AddMMCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/AddMMSYCL.cpp b/cpp/open3d/core/linalg/AddMMSYCL.cpp new file mode 100644 index 00000000000..591a4f26f00 --- /dev/null +++ b/cpp/open3d/core/linalg/AddMMSYCL.cpp @@ -0,0 +1,49 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + blas::column_major::gemm(queue, gemmTrA ? transpose::T : transpose::N, + gemmTrB ? 
transpose::T : transpose::N, m, n, k, + static_cast(alpha), + static_cast(A_data), lda, + static_cast(B_data), ldb, + static_cast(beta), + static_cast(C_data), ldc) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Inverse.cpp b/cpp/open3d/core/linalg/Inverse.cpp index 51cf7693217..07cb507c1ca 100644 --- a/cpp/open3d/core/linalg/Inverse.cpp +++ b/cpp/open3d/core/linalg/Inverse.cpp @@ -55,6 +55,20 @@ void Inverse(const Tensor &A, Tensor &output) { output = output.T(); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + // LAPACKE supports getri, A is in-place modified as output. + Tensor A_T = A.T().To(device, /*copy=*/true); + void *A_data = A_T.GetDataPtr(); + + InverseSYCL(A_data, ipiv_data, nullptr, n, dtype, device); + output = A_T.T(); +#else + utility::LogError("Unimplemented device."); #endif } else { Dtype ipiv_dtype; diff --git a/cpp/open3d/core/linalg/Inverse.h b/cpp/open3d/core/linalg/Inverse.h index 8809f8f176c..2d3cbb2bbcf 100644 --- a/cpp/open3d/core/linalg/Inverse.h +++ b/cpp/open3d/core/linalg/Inverse.h @@ -17,11 +17,20 @@ void Inverse(const Tensor& A, Tensor& output); void InverseCPU(void* A_data, void* ipiv_data, - void* output_data, + [[maybe_unused]] void* output_data, int64_t n, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void InverseSYCL(void* A_data, + void* ipiv_data, + [[maybe_unused]] void* output_data, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void InverseCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/InverseSYCL.cpp b/cpp/open3d/core/linalg/InverseSYCL.cpp new file mode 100644 index 00000000000..f66dd089cda --- /dev/null +++ b/cpp/open3d/core/linalg/InverseSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/Inverse.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void InverseSYCL(void* A_data, + void* ipiv_data, + void* output_data, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. 
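+        // One scratchpad is shared by getrf and getri, so it is sized for the
+        // larger of the two requirements. getri is made dependent on the
+        // getrf event (lu_done), so only the final call needs
+        // wait_and_throw().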
+ int64_t scratchpad_size = std::max( + lapack::getrf_scratchpad_size(queue, n, n, lda), + lapack::getri_scratchpad_size(queue, n, lda)); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + auto lu_done = + lapack::getrf(queue, n, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size); + lapack::getri(queue, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size, {lu_done}) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LU.cpp b/cpp/open3d/core/linalg/LU.cpp index 5bfafdac425..3a9a4e4788c 100644 --- a/cpp/open3d/core/linalg/LU.cpp +++ b/cpp/open3d/core/linalg/LU.cpp @@ -16,9 +16,9 @@ namespace open3d { namespace core { // Get column permutation tensor from ipiv (swapping index array). -static core::Tensor GetColPermutation(const Tensor& ipiv, - int number_of_indices, - int number_of_rows) { +static Tensor GetColPermutation(const Tensor& ipiv, + int number_of_indices, + int number_of_rows) { Tensor full_ipiv = Tensor::Arange(0, number_of_rows, 1, core::Int32, Device("CPU:0")); Tensor ipiv_cpu = ipiv.To(Device("CPU:0"), core::Int32, /*copy=*/false); @@ -42,14 +42,14 @@ static void OutputToPLU(const Tensor& output, const Tensor& ipiv, const bool permute_l) { int n = output.GetShape()[0]; - core::Device device = output.GetDevice(); + Device device = output.GetDevice(); // Get upper and lower matrix from output matrix. Triul(output, upper, lower, 0); // Get column permutation vector from pivot indices vector. Tensor col_permutation = GetColPermutation(ipiv, ipiv.GetShape()[0], n); // Creating "Permutation Matrix (P in P.A = L.U)". - permutation = core::Tensor::Eye(n, output.GetDtype(), device) + permutation = Tensor::Eye(n, output.GetDtype(), device) .IndexGet({col_permutation}); // Calculating P in A = P.L.U. [P.Inverse() = P.T()]. permutation = permutation.T().Contiguous(); @@ -88,15 +88,23 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { // elements as U, (diagonal elements of L are unity), and ipiv array, // which has the pivot indices (for 1 <= i <= min(M,N), row i of the // matrix was interchanged with row IPIV(i). + int64_t ipiv_len = std::min(rows, cols); if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, core::Int32, device); + ipiv = Tensor::Empty({ipiv_len}, core::Int32, device); void* ipiv_data = ipiv.GetDataPtr(); LUCUDA(A_data, ipiv_data, rows, cols, dtype, device); #else utility::LogInfo("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ipiv = Tensor::Empty({ipiv_len}, core::Int64, device); + void* ipiv_data = ipiv.GetDataPtr(); + LUSYCL(A_data, ipiv_data, rows, cols, dtype, device); +#else + utility::LogInfo("Unimplemented device."); #endif } else { Dtype ipiv_dtype; @@ -107,9 +115,7 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { } else { utility::LogError("Unsupported OPEN3D_CPU_LINALG_INT type."); } - - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, ipiv_dtype, device); + ipiv = Tensor::Empty({ipiv_len}, ipiv_dtype, device); void* ipiv_data = ipiv.GetDataPtr(); LUCPU(A_data, ipiv_data, rows, cols, dtype, device); } @@ -125,7 +131,7 @@ void LU(const Tensor& A, AssertTensorDtypes(A, {Float32, Float64}); // Get output matrix and ipiv. 
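+    // ipiv records row interchanges in the 1-based LAPACK convention, e.g.
+    // for a 3x3 input, ipiv = {3, 3, 3} means row 1 was swapped with row 3,
+    // then row 2 with row 3 (row 3 stays in place). GetColPermutation()
+    // converts these pairwise swaps into the single permutation used to
+    // build P.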
- core::Tensor ipiv, output; + Tensor ipiv, output; LUIpiv(A, ipiv, output); // Decompose output in P, L, U matrix form. diff --git a/cpp/open3d/core/linalg/LUImpl.h b/cpp/open3d/core/linalg/LUImpl.h index 36898fa3de9..fde8f9df385 100644 --- a/cpp/open3d/core/linalg/LUImpl.h +++ b/cpp/open3d/core/linalg/LUImpl.h @@ -21,6 +21,15 @@ void LUCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t rows, + int64_t cols, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void LUCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/LUSYCL.cpp b/cpp/open3d/core/linalg/LUSYCL.cpp new file mode 100644 index 00000000000..9368270c685 --- /dev/null +++ b/cpp/open3d/core/linalg/LUSYCL.cpp @@ -0,0 +1,42 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LUImpl.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = m; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = + lapack::getrf_scratchpad_size(queue, m, n, lda); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::getrf(queue, m, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LeastSquares.cpp b/cpp/open3d/core/linalg/LeastSquares.cpp index 6340331c5ce..46c520215e1 100644 --- a/cpp/open3d/core/linalg/LeastSquares.cpp +++ b/cpp/open3d/core/linalg/LeastSquares.cpp @@ -63,6 +63,12 @@ void LeastSquares(const Tensor &A, const Tensor &B, Tensor &X) { LeastSquaresCUDA(A_data, B_data, m, n, k, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + LeastSquaresSYCL(A_data, B_data, m, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { LeastSquaresCPU(A_data, B_data, m, n, k, dtype, device); diff --git a/cpp/open3d/core/linalg/LeastSquares.h b/cpp/open3d/core/linalg/LeastSquares.h index c2e79935f5d..aedc33a5a39 100644 --- a/cpp/open3d/core/linalg/LeastSquares.h +++ b/cpp/open3d/core/linalg/LeastSquares.h @@ -25,6 +25,16 @@ void LeastSquaresCUDA(void* A_data, const Device& device); #endif +#ifdef BUILD_SYCL_MODULE +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + void LeastSquaresCPU(void* A_data, void* B_data, int64_t m, diff --git a/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp new file mode 100644 index 00000000000..faa9dfb3307 --- /dev/null +++ b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp @@ -0,0 +1,46 @@ 
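+// Note: oneMKL lapack::gels_batch solves min ||A X - B|| for a batch of
+// systems; below a single system (batch_size = 1) is solved, with strides
+// spanning the full matrices. A minimal Tensor-level call (illustrative
+// shapes; `device` is assumed to be an available SYCL device):
+//   core::Tensor A = core::Tensor::Ones({4, 3}, core::Float32, device);
+//   core::Tensor B = core::Tensor::Ones({4, 2}, core::Float32, device);
+//   core::Tensor X;
+//   core::LeastSquares(A, B, X);  // X has shape {3, 2}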
+// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LeastSquares.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int nrhs = k, lda = m, stride_a = lda * n, ldb = std::max(m, n), + stride_b = ldb * nrhs, batch_size = 1; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = lapack::gels_batch_scratchpad_size( + queue, transpose::N, m, n, nrhs, lda, stride_a, ldb, stride_b, + batch_size); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gels_batch( + queue, transpose::N, m, n, nrhs, static_cast(A_data), + lda, stride_a, static_cast(B_data), ldb, stride_b, + batch_size, static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Matmul.cpp b/cpp/open3d/core/linalg/Matmul.cpp index 1b616c5f335..012a116b736 100644 --- a/cpp/open3d/core/linalg/Matmul.cpp +++ b/cpp/open3d/core/linalg/Matmul.cpp @@ -65,7 +65,13 @@ void Matmul(const Tensor& A, const Tensor& B, Tensor& output) { output = Tensor::Empty({m, n}, dtype, device); void* C_data = output.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + MatmulSYCL(B_data, A_data, C_data, n, k, m, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); MatmulCUDA(B_data, A_data, C_data, n, k, m, dtype, device); diff --git a/cpp/open3d/core/linalg/Matmul.h b/cpp/open3d/core/linalg/Matmul.h index da29240b0c4..eeaba408591 100644 --- a/cpp/open3d/core/linalg/Matmul.h +++ b/cpp/open3d/core/linalg/Matmul.h @@ -15,6 +15,16 @@ namespace core { /// Computes matrix multiplication C = AB. 
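+/// Example (illustrative; `device` is any supported core::Device): for A of
+/// shape {m, k} and B of shape {k, n}, the output C has shape {m, n}.
+///
+///     core::Tensor A = core::Tensor::Ones({2, 3}, core::Float32, device);
+///     core::Tensor B = core::Tensor::Ones({3, 4}, core::Float32, device);
+///     core::Tensor C;
+///     core::Matmul(A, B, C);  // C: {2, 4}, every element equals 3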
void Matmul(const Tensor& A, const Tensor& B, Tensor& C); +#ifdef BUILD_SYCL_MODULE +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device); +#endif #ifdef BUILD_CUDA_MODULE void MatmulCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/MatmulSYCL.cpp b/cpp/open3d/core/linalg/MatmulSYCL.cpp new file mode 100644 index 00000000000..be2d6f94ac7 --- /dev/null +++ b/cpp/open3d/core/linalg/MatmulSYCL.cpp @@ -0,0 +1,39 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t alpha = 1, beta = 0; + blas::column_major::gemm(queue, transpose::N, transpose::N, m, n, k, + alpha, static_cast(A_data), m, + static_cast(B_data), k, beta, + static_cast(C_data), m) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/SVD.cpp b/cpp/open3d/core/linalg/SVD.cpp index 935d014ef4e..657e79f9c57 100644 --- a/cpp/open3d/core/linalg/SVD.cpp +++ b/cpp/open3d/core/linalg/SVD.cpp @@ -40,23 +40,31 @@ void SVD(const Tensor &A, Tensor &U, Tensor &S, Tensor &VT) { U = Tensor::Empty({m, m}, dtype, device); S = Tensor::Empty({n}, dtype, device); VT = Tensor::Empty({n, n}, dtype, device); - Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); void *A_data = A_T.GetDataPtr(); void *U_data = U.GetDataPtr(); void *S_data = S.GetDataPtr(); void *VT_data = VT.GetDataPtr(); - void *superb_data = superb.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + SVDSYCL(A_data, U_data, S_data, VT_data, m, n, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCUDA(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); #else utility::LogError("Unimplemented device."); #endif } else { + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCPU(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); } diff --git a/cpp/open3d/core/linalg/SVD.h b/cpp/open3d/core/linalg/SVD.h index 9e6deefa6c1..51d775d7a9d 100644 --- a/cpp/open3d/core/linalg/SVD.h +++ b/cpp/open3d/core/linalg/SVD.h @@ -16,6 +16,17 @@ namespace core { /// is a min(m, n), VT is an n x n tensor. 
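+/// Example (illustrative): A can be reconstructed from the factors as
+/// U * Sigma * VT, where Sigma denotes an {m, n} matrix with S on its main
+/// diagonal and zeros elsewhere.
+///
+///     core::Tensor U, S, VT;
+///     core::SVD(A, U, S, VT);
+///     // A ~ U.Matmul(Sigma).Matmul(VT) up to round-off.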
void SVD(const Tensor& A, Tensor& U, Tensor& S, Tensor& VT); +#ifdef BUILD_SYCL_MODULE +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SVDCUDA(const void* A_data, void* U_data, diff --git a/cpp/open3d/core/linalg/SVDSYCL.cpp b/cpp/open3d/core/linalg/SVDSYCL.cpp new file mode 100644 index 00000000000..27abf31728f --- /dev/null +++ b/cpp/open3d/core/linalg/SVDSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/SVD.h" + +namespace open3d { +namespace core { + +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t lda = m, ldvt = n, ldu = m; + int64_t scratchpad_size = lapack::gesvd_scratchpad_size( + queue, jobsvd::vectors, jobsvd::vectors, m, n, lda, ldu, ldvt); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesvd( + queue, jobsvd::vectors, jobsvd::vectors, m, n, + const_cast(static_cast(A_data)), + lda, static_cast(S_data), + static_cast(U_data), ldu, + static_cast(VT_data), ldvt, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Solve.cpp b/cpp/open3d/core/linalg/Solve.cpp index f1025520fac..ec09e8be3f1 100644 --- a/cpp/open3d/core/linalg/Solve.cpp +++ b/cpp/open3d/core/linalg/Solve.cpp @@ -60,7 +60,16 @@ void Solve(const Tensor &A, const Tensor &B, Tensor &X) { X = B.T().Clone(); void *B_data = X.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + SolveSYCL(A_data, B_data, ipiv_data, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); Tensor ipiv = Tensor::Empty({n}, core::Int32, device); diff --git a/cpp/open3d/core/linalg/Solve.h b/cpp/open3d/core/linalg/Solve.h index 485de7ef0f2..a299c100ca7 100644 --- a/cpp/open3d/core/linalg/Solve.h +++ b/cpp/open3d/core/linalg/Solve.h @@ -23,6 +23,16 @@ void SolveCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SolveCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/SolveSYCL.cpp b/cpp/open3d/core/linalg/SolveSYCL.cpp new file mode 100644 index 00000000000..38d6690ff89 --- /dev/null +++ b/cpp/open3d/core/linalg/SolveSYCL.cpp @@ -0,0 +1,44 @@ +// 
---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/Solve.h" + +namespace open3d { +namespace core { + +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t nrhs = k, lda = n, ldb = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t scratchpad_size = lapack::gesv_scratchpad_size( + queue, n, nrhs, lda, ldb); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesv(queue, n, nrhs, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(B_data), ldb, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Tri.cpp b/cpp/open3d/core/linalg/Tri.cpp index 77e72b076a2..b15b864aa6a 100644 --- a/cpp/open3d/core/linalg/Tri.cpp +++ b/cpp/open3d/core/linalg/Tri.cpp @@ -42,6 +42,12 @@ void Triu(const Tensor& A, Tensor& output, const int diagonal) { TriuCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriuSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriuCPU(A.Contiguous(), output, diagonal); @@ -58,6 +64,12 @@ void Tril(const Tensor& A, Tensor& output, const int diagonal) { TrilCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TrilSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TrilCPU(A.Contiguous(), output, diagonal); @@ -75,6 +87,12 @@ void Triul(const Tensor& A, Tensor& upper, Tensor& lower, const int diagonal) { TriulCUDA(A.Contiguous(), upper, lower, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriulSYCL(A.Contiguous(), upper, lower, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriulCPU(A.Contiguous(), upper, lower, diagonal); diff --git a/cpp/open3d/core/linalg/TriImpl.h b/cpp/open3d/core/linalg/TriImpl.h index 441d9c69c93..db7b204a87c 100644 --- a/cpp/open3d/core/linalg/TriImpl.h +++ b/cpp/open3d/core/linalg/TriImpl.h @@ -22,6 +22,17 @@ void TriulCPU(const Tensor& A, Tensor& lower, const int diagonal = 0); +#ifdef BUILD_SYCL_MODULE +void TriuSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TrilSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TriulSYCL(const Tensor& A, + Tensor& upper, + Tensor& lower, + const int diagonal = 0); +#endif + #ifdef BUILD_CUDA_MODULE void TriuCUDA(const Tensor& A, Tensor& output, const int diagonal = 0); diff --git a/cpp/open3d/core/linalg/TriSYCL.cpp b/cpp/open3d/core/linalg/TriSYCL.cpp new file 
mode 100644 index 00000000000..3d10f99efc3 --- /dev/null +++ b/cpp/open3d/core/linalg/TriSYCL.cpp @@ -0,0 +1,82 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/TriImpl.h" + +namespace open3d { +namespace core { + +void TriuSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) >= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TrilSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) <= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TriulSYCL(const Tensor &A, + Tensor &upper, + Tensor &lower, + const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *upper_ptr = static_cast(upper.GetDataPtr()); + scalar_t *lower_ptr = static_cast(lower.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) < + diagonal) { + lower_ptr[wid_1d] = A_ptr[wid_1d]; + } else if (static_cast(wid[0]) - + static_cast(wid[1]) > + diagonal) { + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } else { + lower_ptr[wid_1d] = 1; + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index aa651596b6d..acc3ddc7e33 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -1,4 +1,6 @@ open3d_ispc_add_library(tgeometry_kernel OBJECT) +set_target_properties(tgeometry_kernel PROPERTIES CXX_VISIBILITY_PRESET "hidden") + target_sources(tgeometry_kernel PRIVATE Image.cpp diff --git a/cpp/open3d/t/geometry/kernel/IPPImage.cpp b/cpp/open3d/t/geometry/kernel/IPPImage.cpp index ba32d1af317..03a884ad022 100644 --- a/cpp/open3d/t/geometry/kernel/IPPImage.cpp +++ 
b/cpp/open3d/t/geometry/kernel/IPPImage.cpp @@ -20,7 +20,6 @@ #endif #include "open3d/core/Dtype.h" -#include "open3d/core/ParallelFor.h" #include "open3d/core/ShapeUtil.h" #include "open3d/core/Tensor.h" #include "open3d/t/geometry/Image.h" diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index c2839eacbae..e6de49e6ed6 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -36,3 +36,11 @@ if (BUILD_ISPC_MODULE) ParallelFor.ispc ) endif() + +# if (BUILD_SYCL_MODULE) +# target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES +# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") +# endif() \ No newline at end of file diff --git a/cpp/tests/core/CoreTest.cpp b/cpp/tests/core/CoreTest.cpp index 8f1486ce4c6..320a50fa01f 100644 --- a/cpp/tests/core/CoreTest.cpp +++ b/cpp/tests/core/CoreTest.cpp @@ -16,6 +16,12 @@ #include "open3d/core/SizeVector.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os) { + *os << device.ToString(); +} +void PrintTo(const Dtype &dtype, std::ostream *os) { *os << dtype.ToString(); } +} // namespace core namespace tests { std::vector PermuteDtypesWithBool::TestCases() { @@ -46,6 +52,7 @@ std::vector PermuteDevices::TestCases() { devices.push_back(cuda_devices[1]); } + utility::LogWarning("Total {} devices.", devices.size()); return devices; } @@ -54,7 +61,7 @@ std::vector PermuteDevicesWithSYCL::TestCases() { std::vector sycl_devices = core::Device::GetAvailableSYCLDevices(); if (!sycl_devices.empty()) { - devices.push_back(sycl_devices[0]); + devices.push_back(sycl_devices[0]); // only the first SYCL device } return devices; } diff --git a/cpp/tests/core/CoreTest.h b/cpp/tests/core/CoreTest.h index ab874b8b485..f8c6f5db273 100644 --- a/cpp/tests/core/CoreTest.h +++ b/cpp/tests/core/CoreTest.h @@ -15,6 +15,10 @@ #include "tests/Tests.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os); +void PrintTo(const Dtype &dtype, std::ostream *os); +} // namespace core namespace tests { class PermuteDtypesWithBool : public testing::TestWithParam { diff --git a/cpp/tests/core/Linalg.cpp b/cpp/tests/core/Linalg.cpp index 2c8be12323b..9b817ae61f8 100644 --- a/cpp/tests/core/Linalg.cpp +++ b/cpp/tests/core/Linalg.cpp @@ -23,10 +23,11 @@ namespace open3d { namespace tests { -class LinalgPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Linalg, - LinalgPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class LinalgPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Linalg, + LinalgPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(LinalgPermuteDevices, Matmul) { const float EPSILON = 1e-8; diff --git a/cpp/tests/core/ParallelForSYCL.cpp b/cpp/tests/core/ParallelForSYCL.cpp new file mode 100644 index 00000000000..ff78804faea --- /dev/null +++ b/cpp/tests/core/ParallelForSYCL.cpp @@ -0,0 +1,65 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/ParallelForSYCL.h" + +#include + +#include "open3d/Macro.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include 
"open3d/core/Tensor.h" +#include "tests/Tests.h" +#include "tests/core/CoreTest.h" + +struct TestIndexerFillKernel { + TestFillKernel(const core::Indexer &indexer_, int64_t multiplier_) + : indexer(indexer_), multiplier(multiplier_) {} + void operator()(int64_t idx) { + indexer.GetOutputPtr(0)[idx] = idx * multiplier; + } + +private: + core::Indexer indexer; + int64_t multiplier; +}; + +struct TestPtrFillKernel { + TestFillKernel(int64_t *out_, int64_t multiplier_) + : out(out_), multiplier(multiplier_) {} + void operator()(int64_t idx) { out[idx] = idx * multiplier; } + +private: + int64_t *out; + int64_t multiplier; +}; + +TEST(ParallelForSYCL, FunctorSYCL) { + const core::Device device("SYCL:0"); + const size_t N = 10000000; + core::Indexer indexer({}, tensor, DtypePolicy::NONE); + int64_t multiplier = 2; + + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL(device, indexer, + multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL( + device, N, tensor.GetDataPtr(), multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } +} \ No newline at end of file diff --git a/cpp/tests/core/Tensor.cpp b/cpp/tests/core/Tensor.cpp index 9f788116be7..0106c5ec6ec 100644 --- a/cpp/tests/core/Tensor.cpp +++ b/cpp/tests/core/Tensor.cpp @@ -95,7 +95,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ConstructorBool) { EXPECT_EQ(t.GetDtype(), dtype); } -TEST_P(TensorPermuteDevices, WithInitValue) { +TEST_P(TensorPermuteDevicesWithSYCL, WithInitValue) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5}; @@ -223,7 +223,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, WithInitValueSizeMismatch) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Arange) { +TEST_P(TensorPermuteDevicesWithSYCL, Arange) { core::Device device = GetParam(); core::Tensor arange; @@ -266,28 +266,28 @@ TEST_P(TensorPermuteDevices, Arange) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Fill) { +TEST_P(TensorPermuteDevicesWithSYCL, Fill) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Fill(1); EXPECT_EQ(t.ToFlatVector(), std::vector({1, 1, 1, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, FillBool) { +TEST_P(TensorPermuteDevicesWithSYCL, FillBool) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, false), {2, 3}, core::Bool, device); t.Fill(true); EXPECT_EQ(t.ToFlatVector(), std::vector(2 * 3, true)); } -TEST_P(TensorPermuteDevices, FillSlice) { +TEST_P(TensorPermuteDevicesWithSYCL, FillSlice) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Slice(1, 0, 3, 2).Fill(1); // t[:, 0:3:2].fill(1) EXPECT_EQ(t.ToFlatVector(), std::vector({1, 0, 1, 1, 0, 1})); } -TEST_P(TensorPermuteDevicePairs, IndexSetFillFancy) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetFillFancy) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -348,7 +348,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBool) { EXPECT_EQ(dst_t.ToFlatVector(), vals); } -TEST_P(TensorPermuteDevices, To) { +TEST_P(TensorPermuteDevicesWithSYCL, To) { core::Device device = GetParam(); core::SizeVector shape{2, 3}; @@ -364,7 +364,7 @@ 
TEST_P(TensorPermuteDevices, To) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevicePairs, ToDevice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, ToDevice) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -382,7 +382,7 @@ TEST_P(TensorPermuteDevicePairs, ToDevice) { EXPECT_ANY_THROW(src_t.To(core::Device("CUDA:100000"))); } -TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -403,7 +403,7 @@ TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Expand) { +TEST_P(TensorPermuteDevicesWithSYCL, Expand) { core::Device device = GetParam(); core::Dtype dtype(core::Float32); @@ -423,7 +423,7 @@ TEST_P(TensorPermuteDevices, Expand) { EXPECT_EQ(dst_t.GetDataPtr(), src_t.GetDataPtr()); } -TEST_P(TensorPermuteDevices, Flatten) { +TEST_P(TensorPermuteDevicesWithSYCL, Flatten) { core::Device device = GetParam(); // Flatten 0-D Tensor. @@ -562,7 +562,7 @@ TEST_P(TensorPermuteSizesDefaultStridesAndDevices, DefaultStrides) { EXPECT_EQ(t.GetStrides(), expected_strides); } -TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { +TEST_P(TensorPermuteDevicesWithSYCL, OperatorSquareBrackets) { core::Device device = GetParam(); // Zero dim @@ -623,7 +623,7 @@ TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { EXPECT_EQ(t_1_2_3.GetBlob(), t.GetBlob()); } -TEST_P(TensorPermuteDevices, Item) { +TEST_P(TensorPermuteDevicesWithSYCL, Item) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -645,7 +645,7 @@ TEST_P(TensorPermuteDevices, Item) { EXPECT_EQ(t_1_2_3.Item(), 23.f); } -TEST_P(TensorPermuteDevices, ItemBool) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemBool) { core::Device device = GetParam(); std::vector vals{true, true, false}; @@ -660,7 +660,7 @@ TEST_P(TensorPermuteDevices, ItemBool) { EXPECT_EQ(t[2].Item(), false); } -TEST_P(TensorPermuteDevices, ItemAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemAssign) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -782,7 +782,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyContiguous) { t_1_copy.GetBlob()->GetDataPtr()); // Points to beginning of Blob } -TEST_P(TensorPermuteDevices, Slice) { +TEST_P(TensorPermuteDevicesWithSYCL, Slice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -839,7 +839,7 @@ TEST_P(TensorPermuteDevices, Slice) { EXPECT_EQ(t_5.ToFlatVector(), std::vector({12, 16})); } -TEST_P(TensorPermuteDevices, GetItem) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItem) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -858,7 +858,7 @@ TEST_P(TensorPermuteDevices, GetItem) { std::vector({12, 14, 16, 18, 20, 22})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -874,7 +874,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { std::vector({0, 1, 1, 2, 3, 5, 8, 13, 21})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -894,7 +894,7 @@ 
TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({13, 17, 14, 18})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -914,7 +914,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { 16, 17, 18, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -937,7 +937,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { 16, 200, 400, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SliceAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, SliceAssign) { core::Device device = GetParam(); core::Tensor dst = core::Tensor::Init( @@ -991,7 +991,7 @@ TEST_P(TensorPermuteDevices, SliceAssign) { 16, 17, 18, 19, 203, 21, 223, 23})); } -TEST_P(TensorPermuteDevices, Append) { +TEST_P(TensorPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -1090,7 +1090,7 @@ TEST_P(TensorPermuteDevices, Append) { } } -TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyNonContiguous) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1127,7 +1127,7 @@ TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { } } -TEST_P(TensorPermuteDevicePairs, IndexGet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1164,7 +1164,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet) { EXPECT_EQ(src_t.GetDtype(), dst_t.GetDtype()); } -TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetNegative) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1188,7 +1188,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet2DBroadcastedIndex) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1218,7 +1218,8 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { 28, 29, 30, 31, 40, 41, 42, 43})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, + IndexGet2DBroadcastedIndexSplitBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1249,7 +1250,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { 16, 20, 40, 44, 17, 21, 41, 45})); } -TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetAssignToBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1279,7 +1280,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { std::vector({5, 10, 17, 22, 5, 10, 17, 22, 5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSeparateBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = 
GetParam(); @@ -1303,7 +1304,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { std::vector({0, 4, 8, 13, 17, 21})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSliceEnd) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1326,7 +1327,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { std::vector({0, 1, 2, 3, 16, 17, 18, 19})); } -TEST_P(TensorPermuteDevicePairs, IndexSet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSet) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1405,7 +1406,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSet) { core::Tensor::Init({10, 11}, src_device))); } -TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1430,7 +1431,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { 0, 0, 0, 0, 20, 20, 20, 0, 0, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, IndexAdd_) { +TEST_P(TensorPermuteDevicesWithSYCL, IndexAdd_) { core::Device device = GetParam(); const int tensor_size = 100; @@ -1464,7 +1465,7 @@ TEST_P(TensorPermuteDevices, IndexAdd_) { } } -TEST_P(TensorPermuteDevices, Permute) { +TEST_P(TensorPermuteDevicesWithSYCL, Permute) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1492,7 +1493,7 @@ TEST_P(TensorPermuteDevices, Permute) { 17, 21, 14, 18, 22, 15, 19, 23})); } -TEST_P(TensorPermuteDevices, Transpose) { +TEST_P(TensorPermuteDevicesWithSYCL, Transpose) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1512,7 +1513,7 @@ TEST_P(TensorPermuteDevices, Transpose) { EXPECT_THROW(t.Transpose(3, 5), std::runtime_error); } -TEST_P(TensorPermuteDevices, T) { +TEST_P(TensorPermuteDevicesWithSYCL, T) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -1556,7 +1557,7 @@ TEST_P(TensorPermuteDevices, Det) { EXPECT_ANY_THROW(core::Tensor::Ones({3, 4}, dtype, device).Det()); } -TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { +TEST_P(TensorPermuteDevicesWithSYCL, ShallowCopyConstructor) { core::Device device = GetParam(); core::Tensor t({2, 3}, core::Float32, device); @@ -1579,7 +1580,7 @@ TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { EXPECT_EQ(t.GetDataPtr(), FirstTensorDataPtr({t})); } -TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { +TEST_P(TensorPermuteDevicesWithSYCL, AdvancedIndexing_IsIndexSplittedBySlice) { core::Device device = GetParam(); core::Tensor idx = core::Tensor::Init({1, 2}, device); @@ -1602,7 +1603,7 @@ TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { {idx, slice, slice, idx})); } -TEST_P(TensorPermuteDevices, Add) { +TEST_P(TensorPermuteDevicesWithSYCL, Add) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1612,7 +1613,7 @@ TEST_P(TensorPermuteDevices, Add) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1622,7 +1623,7 @@ TEST_P(TensorPermuteDevices, Add_) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_BroadcastException) { 
+TEST_P(TensorPermuteDevicesWithSYCL, Add_BroadcastException) { // A.shape = ( 3, 4) // B.shape = (2, 3, 4) // A += B should throw exception. @@ -1642,7 +1643,7 @@ TEST_P(TensorPermuteDevices, Add_BroadcastException) { 20, 22, 24, 26, 28, 30, 32, 34})); } -TEST_P(TensorPermuteDevices, Sub) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1652,7 +1653,7 @@ TEST_P(TensorPermuteDevices, Sub) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Sub_) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1662,7 +1663,7 @@ TEST_P(TensorPermuteDevices, Sub_) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Mul) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1672,7 +1673,7 @@ TEST_P(TensorPermuteDevices, Mul) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Mul_) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1682,27 +1683,31 @@ TEST_P(TensorPermuteDevices, Mul_) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Div) { +TEST_P(TensorPermuteDevicesWithSYCL, Div) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); core::Tensor c = a / b; - EXPECT_EQ(c.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor c_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(c.AllClose(c_ref)); } -TEST_P(TensorPermuteDevices, Div_) { +TEST_P(TensorPermuteDevicesWithSYCL, Div_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); a /= b; - EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor a_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(a.AllClose(a_ref)); } -TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1774,7 +1779,7 @@ TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { EXPECT_THROW(src.Sum({2, -1}, true), std::runtime_error); // Repeated. 
} -TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumNotKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1829,7 +1834,7 @@ TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { EXPECT_EQ(dst.ToFlatVector(), std::vector({276.f})); } -TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -1908,7 +1913,7 @@ TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0})); } -TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMultipleOutputsSumLargeArray) { core::Device device = GetParam(); core::SizeVector shape{3, 7, 8234719}; int64_t size = shape.NumElements(); @@ -1925,7 +1930,7 @@ TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * 8234719, 3)); } -TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) { core::Device device = GetParam(); // num_bytes = 8 * (2 ^ 28) + 1 = 2 ^ 31 + 1 ~= 2GB // max_offsets = num_bytes - 1 = 2 ^ 31 @@ -1943,7 +1948,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { } // np.sum(np.ones((2, large_dim)), dim=0) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{2, large_dim}; @@ -1967,7 +1972,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) { } // np.sum(np.ones((2, large_dim)), dim=1) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{2, large_dim}; @@ -1991,7 +1996,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) { } // np.sum(np.ones((large_dim, 2)), dim=0) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{large_dim, 2}; @@ -2015,7 +2020,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) { } // np.sum(np.ones((large_dim, 2)), dim=1) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase3) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{large_dim, 2}; @@ -2038,7 +2043,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { std::vector(large_dim - 30, 2)); } -TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumLargeArray) { core::Device device = GetParam(); std::vector sizes = TensorSizes::TestCases(); @@ -2060,7 +2065,7 @@ TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { } } -TEST_P(TensorPermuteDevices, ReduceProd) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceProd) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2117,7 +2122,7 @@ TEST_P(TensorPermuteDevices, ReduceProd) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMin) { core::Device device = 
GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2170,7 +2175,7 @@ TEST_P(TensorPermuteDevices, ReduceMin) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMax) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2293,7 +2298,7 @@ TEST_P(TensorPermuteDevices, ReduceArgMax) { std::vector({1, 2, 2, 1, 3, 2})); } -TEST_P(TensorPermuteDevices, Sqrt) { +TEST_P(TensorPermuteDevicesWithSYCL, Sqrt) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{0, 1, 4}, {9, 16, 25}}, device); @@ -2323,7 +2328,7 @@ TEST_P(TensorPermuteDevices, Sqrt) { std::vector({0, 1, 2, 3, 4, 5})); } -TEST_P(TensorPermuteDevices, Sin) { +TEST_P(TensorPermuteDevicesWithSYCL, Sin) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2346,7 +2351,7 @@ TEST_P(TensorPermuteDevices, Sin) { EXPECT_THROW(src.Sin(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Cos) { +TEST_P(TensorPermuteDevicesWithSYCL, Cos) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2369,7 +2374,7 @@ TEST_P(TensorPermuteDevices, Cos) { EXPECT_THROW(src.Cos(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Neg) { +TEST_P(TensorPermuteDevicesWithSYCL, Neg) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2388,7 +2393,7 @@ TEST_P(TensorPermuteDevices, Neg) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, UnaryMinus) { +TEST_P(TensorPermuteDevicesWithSYCL, UnaryMinus) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2403,7 +2408,7 @@ TEST_P(TensorPermuteDevices, UnaryMinus) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, Exp) { +TEST_P(TensorPermuteDevicesWithSYCL, Exp) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2426,7 +2431,7 @@ TEST_P(TensorPermuteDevices, Exp) { EXPECT_THROW(src.Exp(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Abs) { +TEST_P(TensorPermuteDevicesWithSYCL, Abs) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2444,7 +2449,7 @@ TEST_P(TensorPermuteDevices, Abs) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsNan) { +TEST_P(TensorPermuteDevicesWithSYCL, IsNan) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2458,7 +2463,7 @@ TEST_P(TensorPermuteDevices, IsNan) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsInf) { +TEST_P(TensorPermuteDevicesWithSYCL, IsInf) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2472,7 +2477,7 @@ TEST_P(TensorPermuteDevices, IsInf) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsFinite) { +TEST_P(TensorPermuteDevicesWithSYCL, IsFinite) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2486,7 +2491,7 @@ TEST_P(TensorPermuteDevices, IsFinite) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Floor) { +TEST_P(TensorPermuteDevicesWithSYCL, Floor) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2500,7 +2505,7 @@ TEST_P(TensorPermuteDevices, Floor) { 
EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Ceil) { +TEST_P(TensorPermuteDevicesWithSYCL, Ceil) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2514,7 +2519,7 @@ TEST_P(TensorPermuteDevices, Ceil) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Round) { +TEST_P(TensorPermuteDevicesWithSYCL, Round) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2528,7 +2533,7 @@ TEST_P(TensorPermuteDevices, Round) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Trunc) { +TEST_P(TensorPermuteDevicesWithSYCL, Trunc) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2542,7 +2547,7 @@ TEST_P(TensorPermuteDevices, Trunc) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNot) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNot) { core::Device device = GetParam(); std::vector src_vals{true, false, true, false}; @@ -2560,7 +2565,7 @@ TEST_P(TensorPermuteDevices, LogicalNot) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNotFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNotFloat) { core::Device device = GetParam(); std::vector src_vals{0, -1, 1, 2}; @@ -2584,7 +2589,7 @@ TEST_P(TensorPermuteDevices, LogicalNotFloat) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalAnd) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAnd) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2603,7 +2608,7 @@ TEST_P(TensorPermuteDevices, LogicalAnd) { std::vector({true, false, false, false})); } -TEST_P(TensorPermuteDevices, LogicalAndFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAndFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, 0}, {0, 0}}, device); @@ -2616,7 +2621,7 @@ TEST_P(TensorPermuteDevices, LogicalAndFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, LogicalOr) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOr) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2635,7 +2640,7 @@ TEST_P(TensorPermuteDevices, LogicalOr) { std::vector({true, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalOrFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOrFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2648,7 +2653,7 @@ TEST_P(TensorPermuteDevices, LogicalOrFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, LogicalXor) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXor) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2664,7 +2669,7 @@ TEST_P(TensorPermuteDevices, LogicalXor) { std::vector({false, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalXorFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXorFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2677,7 +2682,7 @@ TEST_P(TensorPermuteDevices, LogicalXorFloat) { 
EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, Gt) { +TEST_P(TensorPermuteDevicesWithSYCL, Gt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2693,7 +2698,7 @@ TEST_P(TensorPermuteDevices, Gt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Lt) { +TEST_P(TensorPermuteDevicesWithSYCL, Lt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2709,7 +2714,7 @@ TEST_P(TensorPermuteDevices, Lt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Ge) { +TEST_P(TensorPermuteDevicesWithSYCL, Ge) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2725,7 +2730,7 @@ TEST_P(TensorPermuteDevices, Ge) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Le) { +TEST_P(TensorPermuteDevicesWithSYCL, Le) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2741,7 +2746,7 @@ TEST_P(TensorPermuteDevices, Le) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Eq) { +TEST_P(TensorPermuteDevicesWithSYCL, Eq) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2757,7 +2762,7 @@ TEST_P(TensorPermuteDevices, Eq) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, Ne) { +TEST_P(TensorPermuteDevicesWithSYCL, Ne) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); @@ -2774,7 +2779,7 @@ TEST_P(TensorPermuteDevices, Ne) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, BooleanIndex) { +TEST_P(TensorPermuteDevicesWithSYCL, BooleanIndex) { core::Device device = GetParam(); // a[a < 0] = 0 @@ -2798,7 +2803,7 @@ TEST_P(TensorPermuteDevices, BooleanIndex) { EXPECT_EQ(y.GetDtype(), core::Float32); } -TEST_P(TensorPermuteDevices, NonZeroNumpy) { +TEST_P(TensorPermuteDevicesWithSYCL, NonZeroNumpy) { core::Device device = GetParam(); core::Tensor a = @@ -2812,7 +2817,7 @@ TEST_P(TensorPermuteDevices, NonZeroNumpy) { EXPECT_EQ(results[1].GetShape(), core::SizeVector{3}); } -TEST_P(TensorPermuteDevices, All) { +TEST_P(TensorPermuteDevicesWithSYCL, All) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2843,7 +2848,7 @@ TEST_P(TensorPermuteDevices, All) { EXPECT_ANY_THROW(t.All(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, Any) { +TEST_P(TensorPermuteDevicesWithSYCL, Any) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2875,7 +2880,7 @@ TEST_P(TensorPermuteDevices, Any) { EXPECT_ANY_THROW(t.Any(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, CreationEmpty) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationEmpty) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Empty({}, core::Float32, device); @@ -2899,7 +2904,7 @@ 
TEST_P(TensorPermuteDevices, CreationEmpty) { EXPECT_EQ(a.NumElements(), 6); } -TEST_P(TensorPermuteDevices, CreationFull) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationFull) { core::Device device = GetParam(); const float fill_value = 100; @@ -2934,7 +2939,7 @@ TEST_P(TensorPermuteDevices, CreationFull) { std::vector(a.NumElements(), fill_value)); } -TEST_P(TensorPermuteDevices, CreationZeros) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationZeros) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Zeros({2, 3}, core::Float32, device); @@ -2943,7 +2948,7 @@ TEST_P(TensorPermuteDevices, CreationZeros) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 0)); } -TEST_P(TensorPermuteDevices, CreationOnes) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationOnes) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Ones({2, 3}, core::Float32, device); @@ -2952,7 +2957,7 @@ TEST_P(TensorPermuteDevices, CreationOnes) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 1)); } -TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { +TEST_P(TensorPermuteDevicesWithSYCL, ScalarOperatorOverload) { core::Device device = GetParam(); core::Tensor a; core::Tensor b; @@ -3038,7 +3043,7 @@ TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { EXPECT_EQ(a.ToFlatVector(), std::vector({5, 5})); } -TEST_P(TensorPermuteDevices, ReduceMean) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMean) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -3129,7 +3134,7 @@ TEST_P(TensorPermuteDevices, ReduceMean) { EXPECT_TRUE(std::isnan(dst.ToFlatVector()[0])); } -TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { +TEST_P(TensorPermuteDevicesWithSYCL, ToDLPackFromDLPack) { core::Device device = GetParam(); core::Tensor src_t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -3162,7 +3167,7 @@ TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { std::vector({12, 14, 20, 22})); } -TEST_P(TensorPermuteDevices, IsSame) { +TEST_P(TensorPermuteDevicesWithSYCL, IsSame) { core::Device device = GetParam(); // "Shallow" copy. 
@@ -3206,7 +3211,7 @@ TEST_P(TensorPermuteDevices, IsSame) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorPermuteDevices, RValueScalar) { +TEST_P(TensorPermuteDevicesWithSYCL, RValueScalar) { const core::Device &device = GetParam(); core::Tensor t, t_ref; @@ -3271,7 +3276,7 @@ TEST_P(TensorPermuteDevices, RValueScalar) { EXPECT_TRUE(t.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip) { core::Device device = GetParam(); core::Tensor t, t_clip, t_ref; @@ -3324,7 +3329,7 @@ TEST_P(TensorPermuteDevices, Clip) { EXPECT_TRUE(t_clip.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip_) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip_) { core::Device device = GetParam(); core::Tensor t, t_ref; @@ -3414,7 +3419,7 @@ TEST_P(TensorPermuteDevicePairs, AllEqual) { EXPECT_FALSE(src.AllEqual(dst)); } -TEST_P(TensorPermuteDevices, Iterator) { +TEST_P(TensorPermuteDevicesWithSYCL, Iterator) { core::Device device = GetParam(); core::Tensor t; @@ -3492,7 +3497,7 @@ TEST_P(TensorPermuteDevices, Iterator) { } } -TEST_P(TensorPermuteDevices, ConstIterator) { +TEST_P(TensorPermuteDevicesWithSYCL, ConstIterator) { core::Device device = GetParam(); core::Tensor t; @@ -3565,7 +3570,7 @@ TEST_P(TensorPermuteDevices, ConstIterator) { } } -TEST_P(TensorPermuteDevices, TakeOwnership) { +TEST_P(TensorPermuteDevicesWithSYCL, TakeOwnership) { core::Device device = GetParam(); if (!device.IsCPU()) { GTEST_SKIP(); diff --git a/cpp/tests/core/TensorCheck.cpp b/cpp/tests/core/TensorCheck.cpp index 8b789b481b6..2ab52141806 100644 --- a/cpp/tests/core/TensorCheck.cpp +++ b/cpp/tests/core/TensorCheck.cpp @@ -14,12 +14,13 @@ namespace open3d { namespace tests { -class TensorCheckPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorCheckPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorCheckPermuteDevicesWithSYCL : public PermuteDevices {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + TensorCheckPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDtype) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -58,7 +59,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { } } -TEST_P(TensorCheckPermuteDevices, AssertTensorDtypes) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDtypes) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -87,7 +88,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDtypes) { t, std::vector({core::Int32, core::Int64}))); } -TEST_P(TensorCheckPermuteDevices, AssertTensorDevice) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDevice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -105,7 +106,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDevice) { } } -TEST_P(TensorCheckPermuteDevices, AssertTensorShape) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorShape) { core::Device device = GetParam(); core::Tensor t; diff --git a/cpp/tests/core/TensorFunction.cpp b/cpp/tests/core/TensorFunction.cpp index 0d25b941dfd..8d97fe59720 100644 --- a/cpp/tests/core/TensorFunction.cpp +++ b/cpp/tests/core/TensorFunction.cpp @@ -15,12 +15,12 @@ namespace open3d { namespace tests { -class TensorFunctionPermuteDevices : public PermuteDevices {}; +class 
TensorFunctionPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Tensor, - TensorFunctionPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + TensorFunctionPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -TEST_P(TensorFunctionPermuteDevices, Concatenate) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Concatenate) { core::Device device = GetParam(); core::Tensor a, b, c, output_tensor; @@ -105,13 +105,13 @@ TEST_P(TensorFunctionPermuteDevices, Concatenate) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Concatenate({a, b.To(core::Float64), c})); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW( core::Concatenate({a, b.To(core::Device("CPU:0")), c})); } } -TEST_P(TensorFunctionPermuteDevices, Append) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -205,7 +205,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Append(self, other.To(core::Float64))); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW(core::Append(self, other.To(core::Device("CPU:0")))); } @@ -215,7 +215,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { EXPECT_TRUE(core::Append(self, other).AllClose(self.Append(other))); } -TEST_P(TensorFunctionPermuteDevices, Maximum) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Maximum) { core::Device device = GetParam(); core::Tensor input, other, output; @@ -268,7 +268,7 @@ TEST_P(TensorFunctionPermuteDevices, Maximum) { core::Tensor::Init({true, true, true, true}, device))); } -TEST_P(TensorFunctionPermuteDevices, Minimum) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Minimum) { core::Device device = GetParam(); core::Tensor input, other, output; diff --git a/cpp/tests/core/TensorObject.cpp b/cpp/tests/core/TensorObject.cpp index 0ff294dbc88..bfd28430b85 100644 --- a/cpp/tests/core/TensorObject.cpp +++ b/cpp/tests/core/TensorObject.cpp @@ -22,10 +22,11 @@ namespace open3d { namespace tests { -class TensorObjectPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorObject, - TensorObjectPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorObjectPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorObject, + TensorObjectPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); class TensorObjectPermuteDevicePairs : public PermuteDevicePairs {}; INSTANTIATE_TEST_SUITE_P( @@ -51,7 +52,7 @@ static_assert(std::is_pod(), "TestObject must be a POD."); static const int64_t byte_size = sizeof(TestObject); static const std::string class_name = "TestObject"; -TEST_P(TensorObjectPermuteDevices, Constructor) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, Constructor) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -70,7 +71,7 @@ TEST_P(TensorObjectPermuteDevices, Constructor) { EXPECT_ANY_THROW(core::Tensor({-1, -1}, dtype, device)); } -TEST_P(TensorObjectPermuteDevices, WithInitValueObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, WithInitValueObject) { core::Device device = GetParam(); core::Dtype dtype = core::Dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -81,7 +82,7 @@ TEST_P(TensorObjectPermuteDevices, WithInitValueObject) { EXPECT_EQ(t.ToFlatVector(), vals); } -TEST_P(TensorObjectPermuteDevices,
FillObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, FillObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -156,7 +157,7 @@ TEST_P(TensorObjectPermuteDevicePairs, CopyBroadcastObject) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorObjectPermuteDevices, ItemObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ItemObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -172,7 +173,7 @@ TEST_P(TensorObjectPermuteDevices, ItemObject) { EXPECT_EQ(t[2].Item(), TestObject(4)); } -TEST_P(TensorObjectPermuteDevices, ItemAssignObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ItemAssignObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -187,7 +188,7 @@ TEST_P(TensorObjectPermuteDevices, ItemAssignObject) { EXPECT_EQ(t[1][2][3].Item(), TestObject(100)); } -TEST_P(TensorObjectPermuteDevices, IsSameObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, IsSameObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -230,7 +231,7 @@ TEST_P(TensorObjectPermuteDevices, IsSameObject) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorObjectPermuteDevices, ConstructFromObjectTensorVector) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ConstructFromObjectTensorVector) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -255,7 +256,7 @@ TEST_P(TensorObjectPermuteDevices, ConstructFromObjectTensorVector) { EXPECT_FALSE(tl[2].IsSame(t2)); } -TEST_P(TensorObjectPermuteDevices, TensorListFromObjectTensor) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, TensorListFromObjectTensor) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); diff --git a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp index 2139b40263c..d0a7189edd8 100644 --- a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp +++ b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(AxisAlignedBoundingBox, - AxisAlignedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + AxisAlignedBoundingBox, + AxisAlignedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class AxisAlignedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class AxisAlignedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( AxisAlignedBoundingBox, AxisAlignedBoundingBoxPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/LineSet.cpp b/cpp/tests/t/geometry/LineSet.cpp index 30c86eaa78f..fb6eefb41c1 100644 --- a/cpp/tests/t/geometry/LineSet.cpp +++ b/cpp/tests/t/geometry/LineSet.cpp @@ -16,12 +16,13 @@ namespace open3d { namespace tests { -class LineSetPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(LineSet, - LineSetPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class LineSetPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + LineSet, + LineSetPermuteDevices, + 
testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class LineSetPermuteDevicePairs : public PermuteDevicePairs {}; +class LineSetPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( LineSet, LineSetPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/OrientedBoundingBox.cpp b/cpp/tests/t/geometry/OrientedBoundingBox.cpp index 5a730f4cfb2..8ac9cf7e737 100644 --- a/cpp/tests/t/geometry/OrientedBoundingBox.cpp +++ b/cpp/tests/t/geometry/OrientedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class OrientedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(OrientedBoundingBox, - OrientedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OrientedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + OrientedBoundingBox, + OrientedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class OrientedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class OrientedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( OrientedBoundingBox, OrientedBoundingBoxPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/PointCloud.cpp b/cpp/tests/t/geometry/PointCloud.cpp index 2a74c9c75c1..604d91f7358 100644 --- a/cpp/tests/t/geometry/PointCloud.cpp +++ b/cpp/tests/t/geometry/PointCloud.cpp @@ -24,12 +24,13 @@ namespace open3d { namespace tests { -class PointCloudPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(PointCloud, - PointCloudPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class PointCloudPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + PointCloud, + PointCloudPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class PointCloudPermuteDevicePairs : public PermuteDevicePairs {}; +class PointCloudPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( PointCloud, PointCloudPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/TensorMap.cpp b/cpp/tests/t/geometry/TensorMap.cpp index edf80230073..9ae1a37e300 100644 --- a/cpp/tests/t/geometry/TensorMap.cpp +++ b/cpp/tests/t/geometry/TensorMap.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorMapPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorMap, - TensorMapPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorMapPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorMap, + TensorMapPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TensorMapPermuteDevices, Constructor) { core::Dtype dtype = core::Float32; diff --git a/cpp/tests/t/geometry/TriangleMesh.cpp b/cpp/tests/t/geometry/TriangleMesh.cpp index 7bca9b4a55e..a7ecf816a03 100644 --- a/cpp/tests/t/geometry/TriangleMesh.cpp +++ b/cpp/tests/t/geometry/TriangleMesh.cpp @@ -20,10 +20,11 @@ namespace open3d { namespace tests { -class TriangleMeshPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TriangleMesh, - TriangleMeshPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TriangleMeshPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TriangleMesh, + TriangleMeshPermuteDevices, + testing::ValuesIn(TriangleMeshPermuteDevices::TestCases())); TEST_P(TriangleMeshPermuteDevices, DefaultConstructor) { t::geometry::TriangleMesh mesh;
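All of the test-suite changes above follow one pattern: each fixture is re-parented to, or re-instantiated from, PermuteDevicesWithSYCL, so every TEST_P body also runs on SYCL devices. The sketch below is for orientation only and is not part of the patch: the real definition lives in cpp/tests/core/CoreTest.h / CoreTest.cpp (modified by this patch), and the "SYCL:0" device string and the guard macro used here are assumptions.

// Minimal sketch (assumption) of the fixture shape the renamed tests rely on.
#include <vector>
#include <gtest/gtest.h>
#include "open3d/core/Device.h"

namespace open3d {
namespace tests {

class PermuteDevicesWithSYCL : public testing::TestWithParam<core::Device> {
public:
    // Devices every TEST_P runs on: CPU always, plus a SYCL device when the
    // SYCL module is built. Guard macro and device index are illustrative.
    static std::vector<core::Device> TestCases() {
        std::vector<core::Device> devices{core::Device("CPU:0")};
#ifdef BUILD_SYCL_MODULE
        devices.push_back(core::Device("SYCL:0"));
#endif
        return devices;
    }
};

}  // namespace tests
}  // namespace open3d

// Usage, matching the instantiations in this patch, e.g.:
// INSTANTIATE_TEST_SUITE_P(Tensor, TensorPermuteDevicesWithSYCL,
//                          testing::ValuesIn(PermuteDevicesWithSYCL::TestCases()));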