From c325213fbc0e68b5b418a03eda0a25d348248fbc Mon Sep 17 00:00:00 2001 From: Sameer Sheorey Date: Thu, 26 Dec 2024 16:02:03 -0800 Subject: [PATCH] WIP --- 3rdparty/find_dependencies.cmake | 10 +- CMakeLists.txt | 16 +- cmake/Open3DSYCLTargetSources.cmake | 6 +- cpp/open3d/core/CMakeLists.txt | 49 ++- cpp/open3d/core/Device.h | 8 + cpp/open3d/core/Indexer.h | 8 +- cpp/open3d/core/Indexer.isph | 4 +- cpp/open3d/core/ParallelFor.h | 5 + cpp/open3d/core/ParallelForSYCL.h | 63 ++++ cpp/open3d/core/SYCLContext.cpp | 51 +++- cpp/open3d/core/SYCLContext.h | 29 +- cpp/open3d/core/kernel/Arange.cpp | 6 + cpp/open3d/core/kernel/Arange.h | 7 + cpp/open3d/core/kernel/ArangeSYCL.cpp | 37 +++ cpp/open3d/core/kernel/BinaryEW.cpp | 6 + cpp/open3d/core/kernel/BinaryEW.h | 7 + cpp/open3d/core/kernel/BinaryEWSYCL.cpp | 270 +++++++++++++++++ cpp/open3d/core/kernel/IndexGetSet.cpp | 9 + cpp/open3d/core/kernel/IndexGetSet.h | 16 + cpp/open3d/core/kernel/IndexGetSetSYCL.cpp | 76 +++++ cpp/open3d/core/kernel/IndexReduction.cpp | 4 + cpp/open3d/core/kernel/IndexReduction.h | 7 + cpp/open3d/core/kernel/IndexReductionSYCL.cpp | 61 ++++ cpp/open3d/core/kernel/NonZero.cpp | 6 + cpp/open3d/core/kernel/NonZero.h | 4 + cpp/open3d/core/kernel/NonZeroSYCL.cpp | 75 +++++ cpp/open3d/core/kernel/Reduction.cpp | 12 + cpp/open3d/core/kernel/Reduction.h | 8 + cpp/open3d/core/kernel/ReductionSYCL.cpp | 164 ++++++++++ cpp/open3d/core/kernel/UnaryEW.cpp | 21 ++ cpp/open3d/core/kernel/UnaryEW.h | 9 +- cpp/open3d/core/kernel/UnaryEWCPU.cpp | 13 - cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp | 53 ++++ cpp/open3d/core/kernel/UnaryEWSYCL.cpp | 286 ++++++++++++++++-- cpp/open3d/core/linalg/AddMM.cpp | 7 + cpp/open3d/core/linalg/AddMM.h | 18 ++ cpp/open3d/core/linalg/AddMMSYCL.cpp | 49 +++ cpp/open3d/core/linalg/Inverse.cpp | 14 + cpp/open3d/core/linalg/Inverse.h | 11 +- cpp/open3d/core/linalg/InverseSYCL.cpp | 48 +++ cpp/open3d/core/linalg/LU.cpp | 28 +- cpp/open3d/core/linalg/LUImpl.h | 9 + cpp/open3d/core/linalg/LUSYCL.cpp | 42 +++ cpp/open3d/core/linalg/LeastSquares.cpp | 6 + cpp/open3d/core/linalg/LeastSquares.h | 10 + cpp/open3d/core/linalg/LeastSquaresSYCL.cpp | 46 +++ cpp/open3d/core/linalg/Matmul.cpp | 8 +- cpp/open3d/core/linalg/Matmul.h | 10 + cpp/open3d/core/linalg/MatmulSYCL.cpp | 39 +++ cpp/open3d/core/linalg/SVD.cpp | 14 +- cpp/open3d/core/linalg/SVD.h | 11 + cpp/open3d/core/linalg/SVDSYCL.cpp | 48 +++ cpp/open3d/core/linalg/Solve.cpp | 11 +- cpp/open3d/core/linalg/Solve.h | 10 + cpp/open3d/core/linalg/SolveSYCL.cpp | 44 +++ cpp/open3d/core/linalg/Tri.cpp | 18 ++ cpp/open3d/core/linalg/TriImpl.h | 11 + cpp/open3d/core/linalg/TriSYCL.cpp | 82 +++++ cpp/open3d/t/geometry/kernel/CMakeLists.txt | 2 + cpp/open3d/t/geometry/kernel/IPPImage.cpp | 1 - cpp/tests/core/CMakeLists.txt | 8 + cpp/tests/core/CoreTest.cpp | 9 +- cpp/tests/core/CoreTest.h | 4 + cpp/tests/core/Linalg.cpp | 9 +- cpp/tests/core/ParallelForSYCL.cpp | 65 ++++ cpp/tests/core/Tensor.cpp | 223 +++++++------- cpp/tests/core/TensorCheck.cpp | 17 +- cpp/tests/core/TensorFunction.cpp | 16 +- cpp/tests/core/TensorObject.cpp | 25 +- .../t/geometry/AxisAlignedBoundingBox.cpp | 12 +- cpp/tests/t/geometry/LineSet.cpp | 11 +- cpp/tests/t/geometry/OrientedBoundingBox.cpp | 12 +- cpp/tests/t/geometry/PointCloud.cpp | 11 +- cpp/tests/t/geometry/TensorMap.cpp | 9 +- cpp/tests/t/geometry/TriangleMesh.cpp | 9 +- 75 files changed, 2164 insertions(+), 279 deletions(-) create mode 100644 cpp/open3d/core/ParallelForSYCL.h create mode 100644 cpp/open3d/core/kernel/ArangeSYCL.cpp create 
mode 100644 cpp/open3d/core/kernel/BinaryEWSYCL.cpp create mode 100644 cpp/open3d/core/kernel/IndexGetSetSYCL.cpp create mode 100644 cpp/open3d/core/kernel/IndexReductionSYCL.cpp create mode 100644 cpp/open3d/core/kernel/NonZeroSYCL.cpp create mode 100644 cpp/open3d/core/kernel/ReductionSYCL.cpp create mode 100644 cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp create mode 100644 cpp/open3d/core/linalg/AddMMSYCL.cpp create mode 100644 cpp/open3d/core/linalg/InverseSYCL.cpp create mode 100644 cpp/open3d/core/linalg/LUSYCL.cpp create mode 100644 cpp/open3d/core/linalg/LeastSquaresSYCL.cpp create mode 100644 cpp/open3d/core/linalg/MatmulSYCL.cpp create mode 100644 cpp/open3d/core/linalg/SVDSYCL.cpp create mode 100644 cpp/open3d/core/linalg/SolveSYCL.cpp create mode 100644 cpp/open3d/core/linalg/TriSYCL.cpp create mode 100644 cpp/tests/core/ParallelForSYCL.cpp diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index d2afbe405c9..62ea33ee1fe 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1532,12 +1532,14 @@ open3d_import_3rdparty_library(3rdparty_uvatlas list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_uvatlas) +# SYCL link options are specified here. Compile options are only applied to SYCL source files and are specified in cmake/Open3DSYCLTargetSources.cmake if(BUILD_SYCL_MODULE) add_library(3rdparty_sycl INTERFACE) target_link_libraries(3rdparty_sycl INTERFACE $<$,$>>:sycl>) target_link_options(3rdparty_sycl INTERFACE - $<$,$>>:-fsycl -fsycl-targets=spir64_x86_64>) + $<$,$>>:-fsycl -fsycl-targets=intel_gpu_acm_g10>) + # $<$,$>>:-fsycl -fsycl-targets=spir64,spir64_gen>) if(NOT BUILD_SHARED_LIBS OR arg_PUBLIC) install(TARGETS 3rdparty_sycl EXPORT Open3DTargets) endif() @@ -1563,8 +1565,12 @@ if(OPEN3D_USE_ONEAPI_PACKAGES) GROUPED INCLUDE_DIRS ${MKL_INCLUDE}/ LIB_DIR ${MKL_ROOT}/lib/intel64 - LIBRARIES mkl_intel_ilp64 mkl_tbb_thread mkl_core + LIBRARIES $<$:mkl_sycl> mkl_intel_ilp64 mkl_tbb_thread mkl_core ) + if (BUILD_SYCL_MODULE) + # target_link_options(3rdparty_mkl INTERFACE "-Wl,-export-dynamic") + target_link_libraries(3rdparty_mkl INTERFACE OpenCL) + endif() # MKL definitions target_compile_options(3rdparty_mkl INTERFACE "$<$:$<$:-m64>>") target_compile_definitions(3rdparty_mkl INTERFACE "$<$:MKL_ILP64>") diff --git a/CMakeLists.txt b/CMakeLists.txt index e1bd4706288..635e43e6abf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,15 +282,19 @@ endif() if(BUILD_SYCL_MODULE AND NOT GLIBCXX_USE_CXX11_ABI) message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires GLIBCXX_USE_CXX11_ABI=ON") endif() -if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") -endif() -if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") -endif() +# if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) +# message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") +# endif() +# if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) +# message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") +# endif() if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE) message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.") endif() +# Use LLD with icpx for faster linking +if (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + add_link_options("-fuse-ld=lld") +endif() # Global flag to set CXX standard. # This does not affect 3rd party libraries. 
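[Editor's note, not part of the patch] The comment added above in 3rdparty/find_dependencies.cmake describes the intended split: link-time SYCL flags come from the 3rdparty_sycl interface target (and the MKL SYCL library when BUILD_SYCL_MODULE is on), while the per-source compile flags (-fsycl, -fsycl-targets=...) are attached only to files registered through open3d_sycl_target_sources() in cmake/Open3DSYCLTargetSources.cmake, shown in the next hunk. A minimal sketch of how a SYCL translation unit is expected to be wired up, mirroring the calls this patch adds to cpp/open3d/core/CMakeLists.txt (the target name and file list are only an example):

    # Sources listed here receive the per-file SYCL compile options when
    # BUILD_SYCL_MODULE=ON; otherwise the function behaves like target_sources().
    if (BUILD_SYCL_MODULE)
        open3d_sycl_target_sources(core_impl PRIVATE
            kernel/UnaryEWSYCL.cpp
            kernel/BinaryEWSYCL.cpp
        )
    endif()
    # No per-target link flags are needed here: -fsycl and the device target are
    # propagated at link time by the 3rdparty_sycl INTERFACE target.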
diff --git a/cmake/Open3DSYCLTargetSources.cmake b/cmake/Open3DSYCLTargetSources.cmake index 9b0220c10ab..baa075f38e7 100644 --- a/cmake/Open3DSYCLTargetSources.cmake +++ b/cmake/Open3DSYCLTargetSources.cmake @@ -2,7 +2,8 @@ # # When BUILD_SYCL_MODULE=ON, set SYCL-specific compile flags for the listed # source files and call target_sources(). If BUILD_SYCL_MODULE=OFF, this -# function directly calls target_sources(). +# function directly calls target_sources(). For SYCL link options, see +# 3rdparty/find_dependencies.cmake # # Note: this is not a perfect forwarding to target_sources(), as it only support # limited set of arguments. See the example usage below. @@ -31,7 +32,8 @@ function(open3d_sycl_target_sources target) if(BUILD_SYCL_MODULE) foreach(sycl_file IN LISTS arg_UNPARSED_ARGUMENTS) set_source_files_properties(${sycl_file} PROPERTIES - COMPILE_OPTIONS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_x86_64) + COMPILE_OPTIONS "-fsycl;-fsycl-targets=intel_gpu_acm_g10") + #COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64,spir64_gen") if(arg_VERBOSE) message(STATUS "open3d_sycl_target_sources(${target}): marked ${sycl_file} as SYCL code") endif() diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index c34cea3fc6d..210cc33058a 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -5,7 +5,6 @@ target_sources(core PRIVATE CUDAUtils.cpp Device.cpp Dtype.cpp - EigenConverter.cpp Indexer.cpp MemoryManager.cpp MemoryManagerCached.cpp @@ -23,6 +22,7 @@ target_sources(core PRIVATE # Compile regardless BUILD_SYCL_MODULE == ON or OFF. open3d_sycl_target_sources(core PRIVATE + EigenConverter.cpp SYCLUtils.cpp ) @@ -43,22 +43,14 @@ target_sources(core PRIVATE hashmap/HashSet.cpp kernel/Kernel.cpp linalg/AddMM.cpp - linalg/AddMMCPU.cpp linalg/Det.cpp linalg/Inverse.cpp - linalg/InverseCPU.cpp linalg/LeastSquares.cpp - linalg/LeastSquaresCPU.cpp linalg/LU.cpp - linalg/LUCPU.cpp linalg/Matmul.cpp - linalg/MatmulCPU.cpp linalg/Solve.cpp - linalg/SolveCPU.cpp linalg/SVD.cpp - linalg/SVDCPU.cpp linalg/Tri.cpp - linalg/TriCPU.cpp nns/FixedRadiusIndex.cpp nns/FixedRadiusSearchOps.cpp nns/KnnIndex.cpp @@ -73,21 +65,48 @@ set_target_properties(core_impl PROPERTIES CXX_VISIBILITY_PRESET "hidden") target_sources(core_impl PRIVATE kernel/Arange.cpp - kernel/ArangeCPU.cpp kernel/BinaryEW.cpp - kernel/BinaryEWCPU.cpp kernel/IndexGetSet.cpp - kernel/IndexGetSetCPU.cpp kernel/IndexReduction.cpp - kernel/IndexReductionCPU.cpp kernel/NonZero.cpp - kernel/NonZeroCPU.cpp kernel/Reduction.cpp - kernel/ReductionCPU.cpp kernel/UnaryEW.cpp + kernel/ArangeCPU.cpp + kernel/BinaryEWCPU.cpp + kernel/IndexGetSetCPU.cpp + kernel/IndexReductionCPU.cpp + kernel/NonZeroCPU.cpp + kernel/ReductionCPU.cpp kernel/UnaryEWCPU.cpp + linalg/AddMMCPU.cpp + linalg/InverseCPU.cpp + linalg/LeastSquaresCPU.cpp + linalg/LUCPU.cpp + linalg/MatmulCPU.cpp + linalg/SolveCPU.cpp + linalg/SVDCPU.cpp + linalg/TriCPU.cpp +) + +if (BUILD_SYCL_MODULE) +open3d_sycl_target_sources(core_impl PRIVATE kernel/UnaryEWSYCL.cpp + kernel/BinaryEWSYCL.cpp + kernel/ArangeSYCL.cpp + kernel/IndexGetSetSYCL.cpp + kernel/NonZeroSYCL.cpp + kernel/IndexReductionSYCL.cpp + kernel/ReductionSYCL.cpp + linalg/AddMMSYCL.cpp + linalg/InverseSYCL.cpp + linalg/LeastSquaresSYCL.cpp + linalg/LUSYCL.cpp + linalg/MatmulSYCL.cpp + linalg/SolveSYCL.cpp + linalg/SVDSYCL.cpp + linalg/TriSYCL.cpp ) +endif() if (BUILD_CUDA_MODULE) target_sources(core PRIVATE diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h index 
a79d9cf646c..d215d16e55e 100644 --- a/cpp/open3d/core/Device.h +++ b/cpp/open3d/core/Device.h @@ -115,4 +115,12 @@ struct hash { return std::hash{}(device.ToString()); } }; + +template <> +struct less { + bool operator()(const open3d::core::Device& lhs, + const open3d::core::Device& rhs) const { + return lhs.ToString() < rhs.ToString(); + } +}; } // namespace std diff --git a/cpp/open3d/core/Indexer.h b/cpp/open3d/core/Indexer.h index c9eb8933d49..4fa7a61c21b 100644 --- a/cpp/open3d/core/Indexer.h +++ b/cpp/open3d/core/Indexer.h @@ -34,11 +34,11 @@ class Indexer; class IndexerIterator; // Maximum number of dimensions of TensorRef. -static constexpr int64_t MAX_DIMS = 10; +static constexpr int64_t MAX_DIMS = 4; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -static constexpr int64_t MAX_INPUTS = 10; +static constexpr int64_t MAX_INPUTS = 4; // Maximum number of outputs of an op. This number can be increased when // necessary. @@ -110,7 +110,7 @@ struct TensorRef { TensorRef(const Tensor& t) { if (t.NumDims() > MAX_DIMS) { - utility::LogError("Tenor has too many dimensions {} > {}.", + utility::LogError("Tensor has too many dimensions {} > {}.", t.NumDims(), MAX_DIMS); } data_ptr_ = const_cast(t.GetDataPtr()); @@ -638,7 +638,7 @@ class Indexer { class IndexerIterator { public: struct Iterator { - Iterator(){}; + Iterator() {}; Iterator(const Indexer& indexer); Iterator(Iterator&& other) = default; diff --git a/cpp/open3d/core/Indexer.isph b/cpp/open3d/core/Indexer.isph index c9b63486377..af7629d3f06 100644 --- a/cpp/open3d/core/Indexer.isph +++ b/cpp/open3d/core/Indexer.isph @@ -11,11 +11,11 @@ #include "open3d/utility/Helper.isph" // Maximum number of dimensions of TensorRef. -enum { MAX_DIMS = 10 }; +enum { MAX_DIMS = 4 }; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -enum { MAX_INPUTS = 10 }; +enum { MAX_INPUTS = 4 }; // Maximum number of outputs of an op. This number can be increased when // necessary. diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h index 2d5bef78812..9e917789947 100644 --- a/cpp/open3d/core/ParallelFor.h +++ b/cpp/open3d/core/ParallelFor.h @@ -99,6 +99,11 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) { /// \note If you use a lambda function, capture only the required variables /// instead of all to prevent accidental race conditions. If you want the /// kernel to be used on both CPU and CUDA, capture the variables by value. +/// \note This does not dispatch to SYCL, since SYCL has extra constraints: +/// - Lambdas may capture by value only. +/// - No function pointers / virtual functions. +/// Auto dispatch to SYCL will enforce these conditions even on CPU devices. Use +/// ParallelForSYCL instead. 
template void ParallelFor(const Device& device, int64_t n, const func_t& func) { #ifdef __CUDACC__ diff --git a/cpp/open3d/core/ParallelForSYCL.h b/cpp/open3d/core/ParallelForSYCL.h new file mode 100644 index 00000000000..d74d4853734 --- /dev/null +++ b/cpp/open3d/core/ParallelForSYCL.h @@ -0,0 +1,63 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#pragma once + +#include +#include + +#include "open3d/core/Device.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + Indexer indexer, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + int64_t n = indexer.NumWorkloads(); + if (n == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(n, [indexer, func_args...](int64_t i) { + Functor ef(indexer, func_args...); + ef(i); + }).wait_and_throw(); +} + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + int64_t num_workloads, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + if (num_workloads == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(num_workloads, [func_args...](int64_t i) { + Functor ef(func_args...); + ef(i); + }).wait_and_throw(); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp index 30c5f6f6afc..2264b67b056 100644 --- a/cpp/open3d/core/SYCLContext.cpp +++ b/cpp/open3d/core/SYCLContext.cpp @@ -27,19 +27,44 @@ SYCLContext &SYCLContext::GetInstance() { bool SYCLContext::IsAvailable() { return devices_.size() > 0; } bool SYCLContext::IsDeviceAvailable(const Device &device) { - bool rc = false; - for (const Device &device_ : devices_) { - if (device == device_) { - rc = true; - break; - } + return devices_.find(device) != devices_.end(); +} +std::vector SYCLContext::GetAvailableSYCLDevices() { + std::vector device_vec; + for (const auto &device : devices_) { + device_vec.push_back(device.first); } - return rc; + return device_vec; } -std::vector SYCLContext::GetAvailableSYCLDevices() { return devices_; } sycl::queue SYCLContext::GetDefaultQueue(const Device &device) { - return device_to_default_queue_.at(device); + return devices_.at(device).queue; +} + +SYCLDevice::SYCLDevice(const sycl::device &sycl_device) { + namespace sid = sycl::info::device; + device = sycl_device; + queue = sycl::queue(device); + name = device.get_info(); + max_work_group_size = device.get_info(); + auto aspects = device.get_info(); + fp64 = std::find(aspects.begin(), aspects.end(), sycl::aspect::fp64) != + aspects.end(); + if (!fp64) { + utility::LogWarning( + "SYCL device {} does not support double precision. 
Using " + "emulation.", + name); + } + usm_device_allocations = + std::find(aspects.begin(), aspects.end(), + sycl::aspect::usm_device_allocations) != aspects.end(); + if (!usm_device_allocations) { + utility::LogWarning( + "SYCL device {} does not support USM device allocations. " + "Open3D SYCL support may not work.", + name); + } } SYCLContext::SYCLContext() { @@ -48,9 +73,7 @@ SYCLContext::SYCLContext() { try { const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v); const Device open3d_device = Device("SYCL:0"); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } @@ -66,9 +89,7 @@ SYCLContext::SYCLContext() { const sycl::device &sycl_device = sycl::device(sycl::cpu_selector_v); const Device open3d_device = Device("SYCL:" + std::to_string(devices_.size())); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h index 341da29369e..5c2a160b75e 100644 --- a/cpp/open3d/core/SYCLContext.h +++ b/cpp/open3d/core/SYCLContext.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "open3d/core/Device.h" @@ -23,6 +23,18 @@ namespace open3d { namespace core { namespace sy { +/// @brief SYCL device properties. +struct SYCLDevice { + SYCLDevice(const sycl::device& sycl_device); + std::string name; ///< Fiendlly / descriptive name of the device. + sycl::device device; ///< SYCL device. + sycl::queue queue; ///< Default queue for this device. + size_t max_work_group_size; ///< Preferred work group size + bool fp64; ///< Double precision support, else need to emulate. + bool usm_device_allocations; ///< USM device allocations required for + ///< Open3D. +}; + /// Singleton SYCL context manager. It maintains: /// - A default queue for each SYCL device class SYCLContext { @@ -45,17 +57,16 @@ class SYCLContext { /// Get the default SYCL queue given an Open3D device. sycl::queue GetDefaultQueue(const Device& device); + /// Get SYCL device properties given an Open3D device. + SYCLDevice GetDeviceProperties(const Device& device) { + return devices_.at(device); + }; + private: SYCLContext(); - /// List of available Open3D SYCL devices. - std::vector devices_; - - /// Maps core::Device to the corresponding default SYCL queue. - std::unordered_map device_to_default_queue_; - - /// Maps core::Device to sycl::device. Internal use only for now. - std::unordered_map device_to_sycl_device_; + /// Map from available Open3D SYCL devices to their properties. 
+ std::map devices_; }; } // namespace sy diff --git a/cpp/open3d/core/kernel/Arange.cpp b/cpp/open3d/core/kernel/Arange.cpp index b3385a2a10c..3c85401eece 100644 --- a/cpp/open3d/core/kernel/Arange.cpp +++ b/cpp/open3d/core/kernel/Arange.cpp @@ -63,6 +63,12 @@ Tensor Arange(const Tensor& start, const Tensor& stop, const Tensor& step) { if (device.IsCPU()) { ArangeCPU(start, stop, step, dst); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ArangeSYCL(start, stop, step, dst); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ArangeCUDA(start, stop, step, dst); diff --git a/cpp/open3d/core/kernel/Arange.h b/cpp/open3d/core/kernel/Arange.h index 1a0e88323fd..4547960b5f4 100644 --- a/cpp/open3d/core/kernel/Arange.h +++ b/cpp/open3d/core/kernel/Arange.h @@ -20,6 +20,13 @@ void ArangeCPU(const Tensor& start, const Tensor& step, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void ArangeCUDA(const Tensor& start, const Tensor& stop, diff --git a/cpp/open3d/core/kernel/ArangeSYCL.cpp b/cpp/open3d/core/kernel/ArangeSYCL.cpp new file mode 100644 index 00000000000..cda8912be90 --- /dev/null +++ b/cpp/open3d/core/kernel/ArangeSYCL.cpp @@ -0,0 +1,37 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Arange.h" + +namespace open3d { +namespace core { +namespace kernel { + +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst) { + Dtype dtype = start.GetDtype(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(start.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t sstart = start.Item(); + scalar_t sstep = step.Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + int64_t n = dst.GetLength(); + queue.parallel_for(n, [=](int64_t i) { + dst_ptr[i] = sstart + static_cast(sstep * i); + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/BinaryEW.cpp b/cpp/open3d/core/kernel/BinaryEW.cpp index 00eb9b388e5..e34122cd137 100644 --- a/cpp/open3d/core/kernel/BinaryEW.cpp +++ b/cpp/open3d/core/kernel/BinaryEW.cpp @@ -51,6 +51,12 @@ void BinaryEW(const Tensor& lhs, if (lhs.IsCPU()) { BinaryEWCPU(lhs, rhs, dst, op_code); + } else if (lhs.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + BinaryEWSYCL(lhs, rhs, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (lhs.IsCUDA()) { #ifdef BUILD_CUDA_MODULE BinaryEWCUDA(lhs, rhs, dst, op_code); diff --git a/cpp/open3d/core/kernel/BinaryEW.h b/cpp/open3d/core/kernel/BinaryEW.h index fbd4af3e849..2f2cb57888e 100644 --- a/cpp/open3d/core/kernel/BinaryEW.h +++ b/cpp/open3d/core/kernel/BinaryEW.h @@ -48,6 +48,13 @@ void BinaryEWCPU(const Tensor& lhs, Tensor& dst, BinaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code); 
+#endif + #ifdef BUILD_CUDA_MODULE void BinaryEWCUDA(const Tensor& lhs, const Tensor& rhs, diff --git a/cpp/open3d/core/kernel/BinaryEWSYCL.cpp b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp new file mode 100644 index 00000000000..1466b11f97e --- /dev/null +++ b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp @@ -0,0 +1,270 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelForSYCL.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/BinaryEW.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + +struct BinaryElementKernel { + void operator()(int64_t i) {} + BinaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + +protected: + Indexer indexer; +}; + +// Min, Max +#define BINARY_ELEMENT_KERNEL(name, elem_fn) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = elem_fn(*lhs, *rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Max, sycl::max); +BINARY_ELEMENT_KERNEL(Min, sycl::min); +#undef BINARY_ELEMENT_KERNEL + +/// Specialize Min, Max for Bool, since sycl::min, sycl::max do not support it. +template <> +struct MaxElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs || *rhs; + } +}; +template <> +struct MinElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs && *rhs; + } +}; + +// Arithmetic and Relational ops. +#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = (*lhs)elem_op(*rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Add, +); +BINARY_ELEMENT_KERNEL(Sub, -); +BINARY_ELEMENT_KERNEL(Mul, *); +BINARY_ELEMENT_KERNEL(Div, /); +BINARY_ELEMENT_KERNEL(Gt, >); +BINARY_ELEMENT_KERNEL(Lt, <); +BINARY_ELEMENT_KERNEL(Geq, >=); +BINARY_ELEMENT_KERNEL(Leq, <=); +BINARY_ELEMENT_KERNEL(Eq, ==); +BINARY_ELEMENT_KERNEL(Neq, !=); +#undef BINARY_ELEMENT_KERNEL + +// Logical ops. 
+#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(*lhs) elem_op static_cast(*rhs); \ + } \ + } +BINARY_ELEMENT_KERNEL(LogicalAnd, &&); +BINARY_ELEMENT_KERNEL(LogicalOr, ||); +BINARY_ELEMENT_KERNEL(LogicalXor, !=); +#undef BINARY_ELEMENT_KERNEL + +} // namespace + +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code) { + Dtype src_dtype = lhs.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device device = lhs.GetDevice(); + + if (s_boolean_binary_ew_op_codes.find(op_code) != + s_boolean_binary_ew_op_codes.end()) { + if (dst_dtype == src_dtype) { + // Inplace boolean op's output type is the same as the + // input. e.g. np.logical_and(a, b, out=a), where a, b are + // floats. + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else if (dst_dtype == core::Bool) { + // By default, output is boolean type. 
+ Indexer indexer({lhs, rhs}, dst, + DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL< + LogicalAndElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL< + LogicalXorElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>( + device, indexer); + break; + default: + break; + } + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == BinaryEWOpCode::Maximum || + op_code == BinaryEWOpCode::Minimum) { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Maximum: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Minimum: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Add: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Sub: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Mul: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Div: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } +} +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexGetSet.cpp b/cpp/open3d/core/kernel/IndexGetSet.cpp index 601b4d73f64..b880f50fb51 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.cpp +++ b/cpp/open3d/core/kernel/IndexGetSet.cpp @@ -35,6 +35,10 @@ void IndexGet(const Tensor& src, if (src.IsCPU()) { IndexGetCPU(src, dst, index_tensors, indexed_shape, indexed_strides); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexGetSYCL(src, dst, index_tensors, indexed_shape, indexed_strides); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexGetCUDA(src, dst, index_tensors, indexed_shape, indexed_strides); @@ -56,6 +60,11 @@ void IndexSet(const Tensor& src, if (dst.IsCPU()) { IndexSetCPU(src_same_device, dst, index_tensors, indexed_shape, indexed_strides); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexSetSYCL(src_same_device, dst, index_tensors, indexed_shape, + indexed_strides); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexSetCUDA(src_same_device, dst, index_tensors, indexed_shape, diff --git a/cpp/open3d/core/kernel/IndexGetSet.h b/cpp/open3d/core/kernel/IndexGetSet.h index 130b80836a9..c4a6d3b22c2 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.h +++ b/cpp/open3d/core/kernel/IndexGetSet.h @@ -26,6 +26,14 @@ void IndexGetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& 
indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexGetCUDA(const Tensor& src, Tensor& dst, @@ -46,6 +54,14 @@ void IndexSetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexSetCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp new file mode 100644 index 00000000000..dfc56397417 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/AdvancedIndexing.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/IndexGetSet.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::GET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::SET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexReduction.cpp b/cpp/open3d/core/kernel/IndexReduction.cpp index 19265d36620..e9c9df3f388 100644 --- a/cpp/open3d/core/kernel/IndexReduction.cpp +++ b/cpp/open3d/core/kernel/IndexReduction.cpp @@ -35,6 +35,10 @@ void 
IndexAdd_(int64_t dim, if (dst.IsCPU()) { IndexAddCPU_(dim, index, src_permute, dst_permute); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexAddSYCL_(dim, index, src_permute, dst_permute); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexAddCUDA_(dim, index, src_permute, dst_permute); diff --git a/cpp/open3d/core/kernel/IndexReduction.h b/cpp/open3d/core/kernel/IndexReduction.h index f4fd3516b41..2b4e122f3f0 100644 --- a/cpp/open3d/core/kernel/IndexReduction.h +++ b/cpp/open3d/core/kernel/IndexReduction.h @@ -24,6 +24,13 @@ void IndexAddCPU_(int64_t dim, const Tensor& src, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void IndexAddCUDA_(int64_t dim, const Tensor& index, diff --git a/cpp/open3d/core/kernel/IndexReductionSYCL.cpp b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp new file mode 100644 index 00000000000..47da284dc93 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp @@ -0,0 +1,61 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst) { + // index: [N,], src: [N, D], dst: [M, D] + // In Indexer, output shape defines the actual primary strides. + // However, in IndexAdd_, input dominates the iterations. + // So put dst (output) at indexer's input, and src (input) at output. + Indexer indexer({dst}, src, DtypePolicy::NONE); + + // Index is simply a 1D contiguous tensor, with a different stride + // behavior to src. So use raw pointer for simplicity. 
+ auto index_ptr = index.GetDataPtr(); + + int64_t broadcasting_elems = 1; + for (int64_t d = 1; d < src.NumDims(); ++d) { + broadcasting_elems *= src.GetShape(d); + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + + // TODO: Replace with SYCL reduction API + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + queue.parallel_for(index.GetLength(), [=](int64_t workload_idx) { + int64_t reduction_idx = workload_idx / broadcasting_elems; + int64_t broadcasting_idx = workload_idx % broadcasting_elems; + + const int64_t idx = index_ptr[reduction_idx]; + int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx; + + // Note input and output is switched here to adapt to the + // indexer + scalar_t* src_ptr = indexer.GetOutputPtr(0, idx); + scalar_t* dst_ptr = indexer.GetInputPtr(0, dst_idx); + sycl::atomic_ref(*dst_ptr) += + *src_ptr; + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/NonZero.cpp b/cpp/open3d/core/kernel/NonZero.cpp index 6d04f94e40a..686ce9ea885 100644 --- a/cpp/open3d/core/kernel/NonZero.cpp +++ b/cpp/open3d/core/kernel/NonZero.cpp @@ -18,6 +18,12 @@ namespace kernel { Tensor NonZero(const Tensor& src) { if (src.IsCPU()) { return NonZeroCPU(src); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return NonZeroSYCL(src); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE return NonZeroCUDA(src); diff --git a/cpp/open3d/core/kernel/NonZero.h b/cpp/open3d/core/kernel/NonZero.h index ab926ddd8c3..36ac8b4f5a0 100644 --- a/cpp/open3d/core/kernel/NonZero.h +++ b/cpp/open3d/core/kernel/NonZero.h @@ -17,6 +17,10 @@ Tensor NonZero(const Tensor& src); Tensor NonZeroCPU(const Tensor& src); +#ifdef BUILD_SYCL_MODULE +Tensor NonZeroSYCL(const Tensor& src); +#endif + #ifdef BUILD_CUDA_MODULE Tensor NonZeroCUDA(const Tensor& src); #endif diff --git a/cpp/open3d/core/kernel/NonZeroSYCL.cpp b/cpp/open3d/core/kernel/NonZeroSYCL.cpp new file mode 100644 index 00000000000..9a9795e9f6a --- /dev/null +++ b/cpp/open3d/core/kernel/NonZeroSYCL.cpp @@ -0,0 +1,75 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include + +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/kernel/NonZero.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +/// Maximum number of dimensions of TensorRef, rounded up to the power of 2 for +/// sycl::vec support. +static constexpr int64_t MAX_DIMS_POW2 = 4; +static_assert(MAX_DIMS_POW2 >= MAX_DIMS, "MAX_DIMS_POW2 too small."); +namespace kernel { + +Tensor NonZeroSYCL(const Tensor& src) { + // Get flattened non-zero indices. 
+ TensorIterator src_iter(src); + const int64_t num_elements = src.NumElements(); + auto device = src.GetDevice(); + Tensor indices = Tensor::Arange(0, num_elements, 1, core::Int64, device); + Tensor non_zero_indices(SizeVector({num_elements}), Int64, device); + int64_t *non_zero_indices_ptr = non_zero_indices.GetDataPtr(), + *indices_ptr = indices.GetDataPtr(); + size_t num_non_zeros; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src.GetDtype(), [&]() { + auto it = std::copy_if( + oneapi::dpl::execution::dpcpp_default, indices_ptr, + indices_ptr + num_elements, non_zero_indices_ptr, + [src_iter](int64_t index) { + auto src_ptr = static_cast( + src_iter.GetPtr(index)); + OPEN3D_ASSERT(src_ptr != nullptr && "Internal error."); + return *src_ptr != 0; + }); + num_non_zeros = std::distance(non_zero_indices_ptr, it); + }); + + // Transform flattened indices to indices in each dimension. + const auto num_dims = src.NumDims(); + SizeVector shape = src.GetShape(); + sycl::vec shape_vec; // device copyable + OPEN3D_ASSERT(shape.size() <= MAX_DIMS_POW2 && "Too many dimensions."); + for (auto k = 0; k < num_dims; ++k) shape_vec[k] = shape[k]; + Tensor result({num_dims, static_cast(num_non_zeros)}, Int64, + device); + int64_t* result_ptr = result.GetDataPtr(); + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.parallel_for(num_non_zeros, [=](int64_t i) { + auto non_zero_index = non_zero_indices_ptr[i]; + auto this_result_ptr = + result_ptr + i + (num_dims - 1) * num_non_zeros; + OPEN3D_ASSERT(this_result_ptr != nullptr && "Internal error."); + for (auto dim = num_dims - 1; dim >= 0; + dim--, this_result_ptr -= num_non_zeros) { + *this_result_ptr = non_zero_index % shape_vec[dim]; + non_zero_index = non_zero_index / shape_vec[dim]; + } + }).wait_and_throw(); + return result; +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/Reduction.cpp b/cpp/open3d/core/kernel/Reduction.cpp index a94087d609a..2b3052a2c48 100644 --- a/cpp/open3d/core/kernel/Reduction.cpp +++ b/cpp/open3d/core/kernel/Reduction.cpp @@ -71,6 +71,18 @@ void Reduction(const Tensor& src, if (src.IsCPU()) { ReductionCPU(src, dst, dims, keepdim, op_code); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + // Tensor dst_cpu = + // Tensor::Empty(dst.GetShape(), dst.GetDtype(), + // Device("SYCL:1")); + // ReductionSYCL(src.To(Device("SYCL:1")), dst_cpu, dims, keepdim, + // op_code); + // dst.CopyFrom(dst_cpu); + ReductionSYCL(src, dst, dims, keepdim, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ReductionCUDA(src, dst, dims, keepdim, op_code); diff --git a/cpp/open3d/core/kernel/Reduction.h b/cpp/open3d/core/kernel/Reduction.h index a1a4b74e3a3..ec3bec62ab7 100644 --- a/cpp/open3d/core/kernel/Reduction.h +++ b/cpp/open3d/core/kernel/Reduction.h @@ -59,6 +59,14 @@ void ReductionCPU(const Tensor& src, bool keepdim, ReductionOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void ReductionCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/ReductionSYCL.cpp b/cpp/open3d/core/kernel/ReductionSYCL.cpp new file mode 100644 index 00000000000..b4c96112866 --- /dev/null +++ b/cpp/open3d/core/kernel/ReductionSYCL.cpp @@ -0,0 +1,164 @@ +// 
---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Reduction.h" +#include "open3d/utility/Logging.h" +#include "open3d/utility/Parallel.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { +// Based on OneAPI GPU optimization guide code sample (Blocked access to input +// data + SYCL builtin reduction ops for final reduction) +template +void SYCLReductionEngine( + Device device, + Indexer indexer, + scalar_t identity = + sycl::known_identity::value) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD sizse) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + auto output = + reinterpret_cast(scalar_out_indexer.GetOutputPtr(0)); + auto e = queue.submit([&](auto& cgh) { + auto sycl_reducer = sycl::reduction( + output, identity, red_op, + {sycl::property::reduction::initialize_to_identity()}); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) { + auto glob_id = item.get_global_id(0); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + auto item_out = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + scalar_t* val = reinterpret_cast( + scalar_out_indexer.GetInputPtr(0, idx)); + item_out = red_op(item_out, *val); + } + red_arg.combine(item_out); + }); + }); + } + queue.wait_and_throw(); +} +} // namespace + +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code) { + Device device = src.GetDevice(); + if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + switch (op_code) { + case ReductionOpCode::Sum: + dst.Fill(0); + SYCLReductionEngine, scalar_t>( + device, indexer); + break; + case ReductionOpCode::Prod: + dst.Fill(1); + SYCLReductionEngine, scalar_t>( + 
device, indexer); + break; + case ReductionOpCode::Min: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Min."); + } else { + SYCLReductionEngine, scalar_t>( + device, indexer); + } + break; + case ReductionOpCode::Max: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Max."); + } else { + SYCLReductionEngine, scalar_t>( + device, indexer); + } + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) { + utility::LogError("SYCL Arg-reduction is not implemented."); + } else if (s_boolean_reduce_ops.find(op_code) != + s_boolean_reduce_ops.end()) { + if (src.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean input tensor."); + } + if (dst.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean output tensor."); + } + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + switch (op_code) { + case ReductionOpCode::All: + // Identity == true. 0-sized tensor, returns true. + dst.Fill(true); + SYCLReductionEngine, bool>(device, + indexer); + break; + case ReductionOpCode::Any: + // Identity == false. 0-sized tensor, returns false. + dst.Fill(false); + SYCLReductionEngine, bool>(device, + indexer); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + } else { + utility::LogError("Unsupported op code."); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEW.cpp b/cpp/open3d/core/kernel/UnaryEW.cpp index 911b2885e34..2513d41205d 100644 --- a/cpp/open3d/core/kernel/UnaryEW.cpp +++ b/cpp/open3d/core/kernel/UnaryEW.cpp @@ -22,6 +22,19 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { src.GetShape(), dst.GetShape()); } + // Check dtype compatibility + const auto float_only_ops = {UnaryEWOpCode::Sqrt, UnaryEWOpCode::Sin, + UnaryEWOpCode::Cos, UnaryEWOpCode::Exp, + UnaryEWOpCode::IsNan, UnaryEWOpCode::IsInf, + UnaryEWOpCode::IsFinite}; + Dtype src_dtype = src.GetDtype(); + if (std::find(float_only_ops.begin(), float_only_ops.end(), op_code) != + float_only_ops.end() && + src_dtype != core::Float32 && src_dtype != core::Float64) { + utility::LogError("Only supports Float32 and Float64, but {} is used.", + src_dtype.ToString()); + } + // Dispatch to device Device src_device = src.GetDevice(); Device dst_device = dst.GetDevice(); @@ -32,6 +45,12 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { if (src_device.IsCPU()) { UnaryEWCPU(src, dst, op_code); + } else if (src_device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + UnaryEWSYCL(src, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src_device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE UnaryEWCUDA(src, dst, op_code); @@ -73,6 +92,8 @@ void Copy(const Tensor& src, Tensor& dst) { #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif + } else { + utility::LogError("Copy: SYCL <-> CUDA Unimplemented device"); } } diff --git a/cpp/open3d/core/kernel/UnaryEW.h b/cpp/open3d/core/kernel/UnaryEW.h index 907b3371167..131dff08b91 100644 --- a/cpp/open3d/core/kernel/UnaryEW.h +++ b/cpp/open3d/core/kernel/UnaryEW.h @@ -32,15 +32,18 @@ enum class UnaryEWOpCode { }; void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); - void UnaryEWCPU(const Tensor& src, Tensor& dst, 
UnaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); #endif -// Copy is separated from other unary ops since it support cross-device copy and -// dtype casting. +// Copy is separated from other unary ops since it supports cross-device copy +// and dtype casting. void Copy(const Tensor& src, Tensor& dst); void CopyCPU(const Tensor& src, Tensor& dst); diff --git a/cpp/open3d/core/kernel/UnaryEWCPU.cpp b/cpp/open3d/core/kernel/UnaryEWCPU.cpp index 1a502e42a1d..f4673200b24 100644 --- a/cpp/open3d/core/kernel/UnaryEWCPU.cpp +++ b/cpp/open3d/core/kernel/UnaryEWCPU.cpp @@ -218,14 +218,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - auto assert_dtype_is_float = [](Dtype dtype) -> void { - if (dtype != core::Float32 && dtype != core::Float64) { - utility::LogError( - "Only supports Float32 and Float64, but {} is used.", - dtype.ToString()); - } - }; - if (op_code == UnaryEWOpCode::LogicalNot) { if (dst_dtype == src_dtype) { Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); @@ -259,7 +251,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { } else if (op_code == UnaryEWOpCode::IsNan || op_code == UnaryEWOpCode::IsInf || op_code == UnaryEWOpCode::IsFinite) { - assert_dtype_is_float(src_dtype); Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); #ifdef BUILD_ISPC_MODULE ispc::Indexer ispc_indexer = indexer.ToISPC(); @@ -291,7 +282,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { switch (op_code) { case UnaryEWOpCode::Sqrt: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSqrtElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -299,7 +289,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Sin: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSinElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -307,7 +296,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Cos: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUCosElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -322,7 +310,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Exp: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUExpElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp new file mode 100644 index 00000000000..fed05af7b47 --- /dev/null +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.bak.cpp @@ -0,0 +1,53 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include + +#include "open3d/core/Dtype.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/UnaryEW.h" +#include 
"open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void CopySYCL(const Tensor& src, Tensor& dst) { + // It has been checked that + // - at least one of src or dst is SYCL device + SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device dst_device = dst.GetDevice(); + Device src_device = src.GetDevice(); + + if (src_dtype != dst_dtype) { + utility::LogError( + "CopySYCL: Dtype conversion from src to dst not implemented!"); + } + if ((dst_device.IsSYCL() && !dst.IsContiguous()) || + (src_device.IsSYCL() && !src.IsContiguous())) { + utility::LogError( + "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); + } + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src_conti.To(dst_device)); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp index fed05af7b47..40592cf6ded 100644 --- a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp @@ -8,8 +8,12 @@ #include #include +#include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelFor.h" +#include "open3d/core/ParallelForSYCL.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/UnaryEW.h" @@ -19,32 +23,274 @@ namespace open3d { namespace core { namespace kernel { +namespace { + +struct UnaryElementKernel { + UnaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + void operator()(int64_t i) {} + +protected: + Indexer indexer; +}; + +template +struct CopyElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(*src); + } +}; + +// Math: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + src_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(elem_op(static_cast(*src))); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + float* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(Sqrt, sycl::sqrt); +UNARY_ELEMENT_KERNEL(Sin, sycl::sin); +UNARY_ELEMENT_KERNEL(Cos, sycl::cos); +UNARY_ELEMENT_KERNEL(Exp, sycl::exp); +// TODO: Use sycl::abs for integers (no casting) +UNARY_ELEMENT_KERNEL(Abs, sycl::fabs); +UNARY_ELEMENT_KERNEL(Floor, sycl::floor); +UNARY_ELEMENT_KERNEL(Ceil, sycl::ceil); +UNARY_ELEMENT_KERNEL(Round, sycl::round); +UNARY_ELEMENT_KERNEL(Trunc, sycl::trunc); +#undef UNARY_ELEMENT_KERNEL + +// No special treatment for unsigned types - we use the SYCL runtime +// default +template +struct 
NegElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const scalar_t* src = indexer.GetInputPtr(0, i); + scalar_t* dst = indexer.GetOutputPtr(i); + *dst = -*src; + } +}; + +// Float checkers: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(static_cast(*src)); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(IsNan, sycl::isnan); +UNARY_ELEMENT_KERNEL(IsInf, sycl::isinf); +UNARY_ELEMENT_KERNEL(IsFinite, sycl::isfinite); +#undef UNARY_ELEMENT_KERNEL + +template +struct LogicalNotElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(!static_cast(*src)); + } +}; +} // namespace + void CopySYCL(const Tensor& src, Tensor& dst) { - // It has been checked that - // - at least one of src or dst is SYCL device + // src and dst have been checked to have the same shape + // at least one of src and dst is SYCL and the other is SYCL or CPU SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(), dst_dtype = dst.GetDtype(); + Device src_device = src.GetDevice(), dst_device = dst.GetDevice(); + Device device_with_queue = dst.IsSYCL() ? dst.GetDevice() : src.GetDevice(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); + + if (src_device.IsSYCL() && dst_device.IsSYCL()) { + if (src.IsContiguous() && dst.IsContiguous() && + src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst.GetDevice(), + src.GetDataPtr(), src.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else if (dst.NumElements() > 1 && dst.IsContiguous() && + src.NumElements() == 1 /*&& !src_dtype.IsObject()*/) { + int64_t num_elements = dst.NumElements(); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + scalar_t scalar_element = src.To(dst_dtype).Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + queue.fill(dst_ptr, scalar_element, num_elements) + .wait_and_throw(); + }); + } else if (src_device == dst_device) { // non-contiguous or broadcast + // on same SYCL device + Indexer indexer({src}, dst, DtypePolicy::NONE); + if (src.GetDtype().IsObject()) { + // TODO: This is likely very slow. Coalesce into less memcpy + // calls. 
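+                // Each queue.memcpy() below enqueues an asynchronous copy;
+                // the single wait_and_throw() after the loop blocks until all
+                // enqueued copies have finished and rethrows any asynchronous
+                // errors reported by the SYCL runtime.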
+ int64_t object_byte_size = src.GetDtype().ByteSize(); + for (int64_t i = 0; i < indexer.NumWorkloads(); ++i) { + const void* src_ptr = indexer.GetInputPtr(0, i); + void* dst_ptr = indexer.GetOutputPtr(i); + queue.memcpy(dst_ptr, src_ptr, object_byte_size); + } + queue.wait_and_throw(); + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + using src_t = scalar_t; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + using dst_t = scalar_t; + ParallelForSYCL>( + device_with_queue, indexer); + }); + }); + } + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else if (src_device.IsCPU() && dst_device.IsSYCL() || + src_device.IsSYCL() && dst_device.IsCPU()) { + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else { + utility::LogError("Wrong device type {} -> {}", src_device.ToString(), + dst_device.ToString()); + } +} + +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { + // src and dst have been changed to have the same shape, device Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - Device dst_device = dst.GetDevice(); - Device src_device = src.GetDevice(); + Device device = src.GetDevice(); // == dst.GetDevice() - if (src_dtype != dst_dtype) { - utility::LogError( - "CopySYCL: Dtype conversion from src to dst not implemented!"); - } - if ((dst_device.IsSYCL() && !dst.IsContiguous()) || - (src_device.IsSYCL() && !src.IsContiguous())) { - utility::LogError( - "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); - } - Tensor src_conti = src.Contiguous(); // No op if already contiguous - if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && - src_dtype == dst_dtype) { - MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, - src_conti.GetDataPtr(), src_conti.GetDevice(), - src_dtype.ByteSize() * shape.NumElements()); + auto assert_dtype_is_float = [](Dtype dtype) -> void { + if (dtype != Float32 && dtype != Float64) { + utility::LogError( + "Only supports Float32 and Float64, but {} is used.", + dtype.ToString()); + } + }; + + if (op_code == UnaryEWOpCode::LogicalNot) { + if (dst_dtype == src_dtype) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else if (dst_dtype == Bool) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == UnaryEWOpCode::IsNan || + op_code == UnaryEWOpCode::IsInf || + op_code == UnaryEWOpCode::IsFinite) { + assert_dtype_is_float(src_dtype); + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + if (op_code == UnaryEWOpCode::IsNan) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsInf) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsFinite) { + ParallelForSYCL>(device, + indexer); + } + }); } else { - dst.CopyFrom(src_conti.To(dst_device)); + Indexer 
indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case UnaryEWOpCode::Sqrt: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Sin: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Cos: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Neg: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Exp: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Abs: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Floor: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Ceil: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Round: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Trunc: + ParallelForSYCL>(device, + indexer); + break; + default: + utility::LogError("Unimplemented op_code for UnaryEWSYCL"); + break; + } + }); } } diff --git a/cpp/open3d/core/linalg/AddMM.cpp b/cpp/open3d/core/linalg/AddMM.cpp index aea908cecbb..45cfd71df7d 100644 --- a/cpp/open3d/core/linalg/AddMM.cpp +++ b/cpp/open3d/core/linalg/AddMM.cpp @@ -98,6 +98,13 @@ void AddMM(const Tensor& A, ldb, lda, ldc, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + AddMMSYCL(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, + ldb, lda, ldc, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { AddMMCPU(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, diff --git a/cpp/open3d/core/linalg/AddMM.h b/cpp/open3d/core/linalg/AddMM.h index 1754e430fef..6d26703ae1b 100644 --- a/cpp/open3d/core/linalg/AddMM.h +++ b/cpp/open3d/core/linalg/AddMM.h @@ -20,6 +20,24 @@ namespace core { void AddMM( const Tensor& A, const Tensor& B, Tensor& C, double alpha, double beta); +#ifdef BUILD_SYCL_MODULE +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void AddMMCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/AddMMSYCL.cpp b/cpp/open3d/core/linalg/AddMMSYCL.cpp new file mode 100644 index 00000000000..591a4f26f00 --- /dev/null +++ b/cpp/open3d/core/linalg/AddMMSYCL.cpp @@ -0,0 +1,49 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + blas::column_major::gemm(queue, gemmTrA ? transpose::T : transpose::N, + gemmTrB ? 
transpose::T : transpose::N, m, n, k, + static_cast(alpha), + static_cast(A_data), lda, + static_cast(B_data), ldb, + static_cast(beta), + static_cast(C_data), ldc) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Inverse.cpp b/cpp/open3d/core/linalg/Inverse.cpp index 51cf7693217..07cb507c1ca 100644 --- a/cpp/open3d/core/linalg/Inverse.cpp +++ b/cpp/open3d/core/linalg/Inverse.cpp @@ -55,6 +55,20 @@ void Inverse(const Tensor &A, Tensor &output) { output = output.T(); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + // LAPACKE supports getri, A is in-place modified as output. + Tensor A_T = A.T().To(device, /*copy=*/true); + void *A_data = A_T.GetDataPtr(); + + InverseSYCL(A_data, ipiv_data, nullptr, n, dtype, device); + output = A_T.T(); +#else + utility::LogError("Unimplemented device."); #endif } else { Dtype ipiv_dtype; diff --git a/cpp/open3d/core/linalg/Inverse.h b/cpp/open3d/core/linalg/Inverse.h index 8809f8f176c..2d3cbb2bbcf 100644 --- a/cpp/open3d/core/linalg/Inverse.h +++ b/cpp/open3d/core/linalg/Inverse.h @@ -17,11 +17,20 @@ void Inverse(const Tensor& A, Tensor& output); void InverseCPU(void* A_data, void* ipiv_data, - void* output_data, + [[maybe_unused]] void* output_data, int64_t n, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void InverseSYCL(void* A_data, + void* ipiv_data, + [[maybe_unused]] void* output_data, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void InverseCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/InverseSYCL.cpp b/cpp/open3d/core/linalg/InverseSYCL.cpp new file mode 100644 index 00000000000..f66dd089cda --- /dev/null +++ b/cpp/open3d/core/linalg/InverseSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/Inverse.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void InverseSYCL(void* A_data, + void* ipiv_data, + void* output_data, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. 
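+        // One scratchpad is shared by getrf and getri, so it is sized for the
+        // larger of the two requirements. getri is made dependent on the
+        // getrf event (lu_done), so only the final call needs
+        // wait_and_throw().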
+ int64_t scratchpad_size = std::max( + lapack::getrf_scratchpad_size(queue, n, n, lda), + lapack::getri_scratchpad_size(queue, n, lda)); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + auto lu_done = + lapack::getrf(queue, n, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size); + lapack::getri(queue, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size, {lu_done}) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LU.cpp b/cpp/open3d/core/linalg/LU.cpp index 5bfafdac425..3a9a4e4788c 100644 --- a/cpp/open3d/core/linalg/LU.cpp +++ b/cpp/open3d/core/linalg/LU.cpp @@ -16,9 +16,9 @@ namespace open3d { namespace core { // Get column permutation tensor from ipiv (swapping index array). -static core::Tensor GetColPermutation(const Tensor& ipiv, - int number_of_indices, - int number_of_rows) { +static Tensor GetColPermutation(const Tensor& ipiv, + int number_of_indices, + int number_of_rows) { Tensor full_ipiv = Tensor::Arange(0, number_of_rows, 1, core::Int32, Device("CPU:0")); Tensor ipiv_cpu = ipiv.To(Device("CPU:0"), core::Int32, /*copy=*/false); @@ -42,14 +42,14 @@ static void OutputToPLU(const Tensor& output, const Tensor& ipiv, const bool permute_l) { int n = output.GetShape()[0]; - core::Device device = output.GetDevice(); + Device device = output.GetDevice(); // Get upper and lower matrix from output matrix. Triul(output, upper, lower, 0); // Get column permutation vector from pivot indices vector. Tensor col_permutation = GetColPermutation(ipiv, ipiv.GetShape()[0], n); // Creating "Permutation Matrix (P in P.A = L.U)". - permutation = core::Tensor::Eye(n, output.GetDtype(), device) + permutation = Tensor::Eye(n, output.GetDtype(), device) .IndexGet({col_permutation}); // Calculating P in A = P.L.U. [P.Inverse() = P.T()]. permutation = permutation.T().Contiguous(); @@ -88,15 +88,23 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { // elements as U, (diagonal elements of L are unity), and ipiv array, // which has the pivot indices (for 1 <= i <= min(M,N), row i of the // matrix was interchanged with row IPIV(i). + int64_t ipiv_len = std::min(rows, cols); if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, core::Int32, device); + ipiv = Tensor::Empty({ipiv_len}, core::Int32, device); void* ipiv_data = ipiv.GetDataPtr(); LUCUDA(A_data, ipiv_data, rows, cols, dtype, device); #else utility::LogInfo("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ipiv = Tensor::Empty({ipiv_len}, core::Int64, device); + void* ipiv_data = ipiv.GetDataPtr(); + LUSYCL(A_data, ipiv_data, rows, cols, dtype, device); +#else + utility::LogInfo("Unimplemented device."); #endif } else { Dtype ipiv_dtype; @@ -107,9 +115,7 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { } else { utility::LogError("Unsupported OPEN3D_CPU_LINALG_INT type."); } - - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, ipiv_dtype, device); + ipiv = Tensor::Empty({ipiv_len}, ipiv_dtype, device); void* ipiv_data = ipiv.GetDataPtr(); LUCPU(A_data, ipiv_data, rows, cols, dtype, device); } @@ -125,7 +131,7 @@ void LU(const Tensor& A, AssertTensorDtypes(A, {Float32, Float64}); // Get output matrix and ipiv. 
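+    // ipiv records row interchanges in the 1-based LAPACK convention, e.g.
+    // for a 3x3 input, ipiv = {3, 3, 3} means row 1 was swapped with row 3,
+    // then row 2 with row 3 (row 3 stays in place). GetColPermutation()
+    // converts these pairwise swaps into the single permutation used to
+    // build P.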
- core::Tensor ipiv, output; + Tensor ipiv, output; LUIpiv(A, ipiv, output); // Decompose output in P, L, U matrix form. diff --git a/cpp/open3d/core/linalg/LUImpl.h b/cpp/open3d/core/linalg/LUImpl.h index 36898fa3de9..fde8f9df385 100644 --- a/cpp/open3d/core/linalg/LUImpl.h +++ b/cpp/open3d/core/linalg/LUImpl.h @@ -21,6 +21,15 @@ void LUCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t rows, + int64_t cols, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void LUCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/LUSYCL.cpp b/cpp/open3d/core/linalg/LUSYCL.cpp new file mode 100644 index 00000000000..9368270c685 --- /dev/null +++ b/cpp/open3d/core/linalg/LUSYCL.cpp @@ -0,0 +1,42 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LUImpl.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = m; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = + lapack::getrf_scratchpad_size(queue, m, n, lda); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::getrf(queue, m, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LeastSquares.cpp b/cpp/open3d/core/linalg/LeastSquares.cpp index 6340331c5ce..46c520215e1 100644 --- a/cpp/open3d/core/linalg/LeastSquares.cpp +++ b/cpp/open3d/core/linalg/LeastSquares.cpp @@ -63,6 +63,12 @@ void LeastSquares(const Tensor &A, const Tensor &B, Tensor &X) { LeastSquaresCUDA(A_data, B_data, m, n, k, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + LeastSquaresSYCL(A_data, B_data, m, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { LeastSquaresCPU(A_data, B_data, m, n, k, dtype, device); diff --git a/cpp/open3d/core/linalg/LeastSquares.h b/cpp/open3d/core/linalg/LeastSquares.h index c2e79935f5d..aedc33a5a39 100644 --- a/cpp/open3d/core/linalg/LeastSquares.h +++ b/cpp/open3d/core/linalg/LeastSquares.h @@ -25,6 +25,16 @@ void LeastSquaresCUDA(void* A_data, const Device& device); #endif +#ifdef BUILD_SYCL_MODULE +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + void LeastSquaresCPU(void* A_data, void* B_data, int64_t m, diff --git a/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp new file mode 100644 index 00000000000..faa9dfb3307 --- /dev/null +++ b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp @@ -0,0 +1,46 @@ 
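+// Note: oneMKL lapack::gels_batch solves min ||A X - B|| for a batch of
+// systems; below a single system (batch_size = 1) is solved, with strides
+// spanning the full matrices. A minimal Tensor-level call (illustrative
+// shapes; `device` is assumed to be an available SYCL device):
+//   core::Tensor A = core::Tensor::Ones({4, 3}, core::Float32, device);
+//   core::Tensor B = core::Tensor::Ones({4, 2}, core::Float32, device);
+//   core::Tensor X;
+//   core::LeastSquares(A, B, X);  // X has shape {3, 2}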
+// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LeastSquares.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int nrhs = k, lda = m, stride_a = lda * n, ldb = std::max(m, n), + stride_b = ldb * nrhs, batch_size = 1; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = lapack::gels_batch_scratchpad_size( + queue, transpose::N, m, n, nrhs, lda, stride_a, ldb, stride_b, + batch_size); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gels_batch( + queue, transpose::N, m, n, nrhs, static_cast(A_data), + lda, stride_a, static_cast(B_data), ldb, stride_b, + batch_size, static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Matmul.cpp b/cpp/open3d/core/linalg/Matmul.cpp index 1b616c5f335..012a116b736 100644 --- a/cpp/open3d/core/linalg/Matmul.cpp +++ b/cpp/open3d/core/linalg/Matmul.cpp @@ -65,7 +65,13 @@ void Matmul(const Tensor& A, const Tensor& B, Tensor& output) { output = Tensor::Empty({m, n}, dtype, device); void* C_data = output.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + MatmulSYCL(B_data, A_data, C_data, n, k, m, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); MatmulCUDA(B_data, A_data, C_data, n, k, m, dtype, device); diff --git a/cpp/open3d/core/linalg/Matmul.h b/cpp/open3d/core/linalg/Matmul.h index da29240b0c4..eeaba408591 100644 --- a/cpp/open3d/core/linalg/Matmul.h +++ b/cpp/open3d/core/linalg/Matmul.h @@ -15,6 +15,16 @@ namespace core { /// Computes matrix multiplication C = AB. 
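+/// Example (illustrative; `device` is any supported core::Device): for A of
+/// shape {m, k} and B of shape {k, n}, the output C has shape {m, n}.
+///
+///     core::Tensor A = core::Tensor::Ones({2, 3}, core::Float32, device);
+///     core::Tensor B = core::Tensor::Ones({3, 4}, core::Float32, device);
+///     core::Tensor C;
+///     core::Matmul(A, B, C);  // C: {2, 4}, every element equals 3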
void Matmul(const Tensor& A, const Tensor& B, Tensor& C); +#ifdef BUILD_SYCL_MODULE +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device); +#endif #ifdef BUILD_CUDA_MODULE void MatmulCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/MatmulSYCL.cpp b/cpp/open3d/core/linalg/MatmulSYCL.cpp new file mode 100644 index 00000000000..be2d6f94ac7 --- /dev/null +++ b/cpp/open3d/core/linalg/MatmulSYCL.cpp @@ -0,0 +1,39 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t alpha = 1, beta = 0; + blas::column_major::gemm(queue, transpose::N, transpose::N, m, n, k, + alpha, static_cast(A_data), m, + static_cast(B_data), k, beta, + static_cast(C_data), m) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/SVD.cpp b/cpp/open3d/core/linalg/SVD.cpp index 935d014ef4e..657e79f9c57 100644 --- a/cpp/open3d/core/linalg/SVD.cpp +++ b/cpp/open3d/core/linalg/SVD.cpp @@ -40,23 +40,31 @@ void SVD(const Tensor &A, Tensor &U, Tensor &S, Tensor &VT) { U = Tensor::Empty({m, m}, dtype, device); S = Tensor::Empty({n}, dtype, device); VT = Tensor::Empty({n, n}, dtype, device); - Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); void *A_data = A_T.GetDataPtr(); void *U_data = U.GetDataPtr(); void *S_data = S.GetDataPtr(); void *VT_data = VT.GetDataPtr(); - void *superb_data = superb.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + SVDSYCL(A_data, U_data, S_data, VT_data, m, n, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCUDA(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); #else utility::LogError("Unimplemented device."); #endif } else { + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCPU(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); } diff --git a/cpp/open3d/core/linalg/SVD.h b/cpp/open3d/core/linalg/SVD.h index 9e6deefa6c1..51d775d7a9d 100644 --- a/cpp/open3d/core/linalg/SVD.h +++ b/cpp/open3d/core/linalg/SVD.h @@ -16,6 +16,17 @@ namespace core { /// is a min(m, n), VT is an n x n tensor. 
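+/// Example (illustrative): A can be reconstructed from the factors as
+/// U * Sigma * VT, where Sigma denotes an {m, n} matrix with S on its main
+/// diagonal and zeros elsewhere.
+///
+///     core::Tensor U, S, VT;
+///     core::SVD(A, U, S, VT);
+///     // A ~ U.Matmul(Sigma).Matmul(VT) up to round-off.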
void SVD(const Tensor& A, Tensor& U, Tensor& S, Tensor& VT); +#ifdef BUILD_SYCL_MODULE +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SVDCUDA(const void* A_data, void* U_data, diff --git a/cpp/open3d/core/linalg/SVDSYCL.cpp b/cpp/open3d/core/linalg/SVDSYCL.cpp new file mode 100644 index 00000000000..27abf31728f --- /dev/null +++ b/cpp/open3d/core/linalg/SVDSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/SVD.h" + +namespace open3d { +namespace core { + +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t lda = m, ldvt = n, ldu = m; + int64_t scratchpad_size = lapack::gesvd_scratchpad_size( + queue, jobsvd::vectors, jobsvd::vectors, m, n, lda, ldu, ldvt); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesvd( + queue, jobsvd::vectors, jobsvd::vectors, m, n, + const_cast(static_cast(A_data)), + lda, static_cast(S_data), + static_cast(U_data), ldu, + static_cast(VT_data), ldvt, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Solve.cpp b/cpp/open3d/core/linalg/Solve.cpp index f1025520fac..ec09e8be3f1 100644 --- a/cpp/open3d/core/linalg/Solve.cpp +++ b/cpp/open3d/core/linalg/Solve.cpp @@ -60,7 +60,16 @@ void Solve(const Tensor &A, const Tensor &B, Tensor &X) { X = B.T().Clone(); void *B_data = X.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + SolveSYCL(A_data, B_data, ipiv_data, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); Tensor ipiv = Tensor::Empty({n}, core::Int32, device); diff --git a/cpp/open3d/core/linalg/Solve.h b/cpp/open3d/core/linalg/Solve.h index 485de7ef0f2..a299c100ca7 100644 --- a/cpp/open3d/core/linalg/Solve.h +++ b/cpp/open3d/core/linalg/Solve.h @@ -23,6 +23,16 @@ void SolveCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SolveCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/SolveSYCL.cpp b/cpp/open3d/core/linalg/SolveSYCL.cpp new file mode 100644 index 00000000000..38d6690ff89 --- /dev/null +++ b/cpp/open3d/core/linalg/SolveSYCL.cpp @@ -0,0 +1,44 @@ +// 
---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/Solve.h" + +namespace open3d { +namespace core { + +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t nrhs = k, lda = n, ldb = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t scratchpad_size = lapack::gesv_scratchpad_size( + queue, n, nrhs, lda, ldb); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesv(queue, n, nrhs, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(B_data), ldb, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Tri.cpp b/cpp/open3d/core/linalg/Tri.cpp index 77e72b076a2..b15b864aa6a 100644 --- a/cpp/open3d/core/linalg/Tri.cpp +++ b/cpp/open3d/core/linalg/Tri.cpp @@ -42,6 +42,12 @@ void Triu(const Tensor& A, Tensor& output, const int diagonal) { TriuCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriuSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriuCPU(A.Contiguous(), output, diagonal); @@ -58,6 +64,12 @@ void Tril(const Tensor& A, Tensor& output, const int diagonal) { TrilCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TrilSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TrilCPU(A.Contiguous(), output, diagonal); @@ -75,6 +87,12 @@ void Triul(const Tensor& A, Tensor& upper, Tensor& lower, const int diagonal) { TriulCUDA(A.Contiguous(), upper, lower, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriulSYCL(A.Contiguous(), upper, lower, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriulCPU(A.Contiguous(), upper, lower, diagonal); diff --git a/cpp/open3d/core/linalg/TriImpl.h b/cpp/open3d/core/linalg/TriImpl.h index 441d9c69c93..db7b204a87c 100644 --- a/cpp/open3d/core/linalg/TriImpl.h +++ b/cpp/open3d/core/linalg/TriImpl.h @@ -22,6 +22,17 @@ void TriulCPU(const Tensor& A, Tensor& lower, const int diagonal = 0); +#ifdef BUILD_SYCL_MODULE +void TriuSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TrilSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TriulSYCL(const Tensor& A, + Tensor& upper, + Tensor& lower, + const int diagonal = 0); +#endif + #ifdef BUILD_CUDA_MODULE void TriuCUDA(const Tensor& A, Tensor& output, const int diagonal = 0); diff --git a/cpp/open3d/core/linalg/TriSYCL.cpp b/cpp/open3d/core/linalg/TriSYCL.cpp new file 
mode 100644 index 00000000000..3d10f99efc3 --- /dev/null +++ b/cpp/open3d/core/linalg/TriSYCL.cpp @@ -0,0 +1,82 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/TriImpl.h" + +namespace open3d { +namespace core { + +void TriuSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) >= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TrilSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) <= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TriulSYCL(const Tensor &A, + Tensor &upper, + Tensor &lower, + const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *upper_ptr = static_cast(upper.GetDataPtr()); + scalar_t *lower_ptr = static_cast(lower.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) < + diagonal) { + lower_ptr[wid_1d] = A_ptr[wid_1d]; + } else if (static_cast(wid[0]) - + static_cast(wid[1]) > + diagonal) { + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } else { + lower_ptr[wid_1d] = 1; + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index aa651596b6d..acc3ddc7e33 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -1,4 +1,6 @@ open3d_ispc_add_library(tgeometry_kernel OBJECT) +set_target_properties(tgeometry_kernel PROPERTIES CXX_VISIBILITY_PRESET "hidden") + target_sources(tgeometry_kernel PRIVATE Image.cpp diff --git a/cpp/open3d/t/geometry/kernel/IPPImage.cpp b/cpp/open3d/t/geometry/kernel/IPPImage.cpp index ba32d1af317..03a884ad022 100644 --- a/cpp/open3d/t/geometry/kernel/IPPImage.cpp +++ 
b/cpp/open3d/t/geometry/kernel/IPPImage.cpp @@ -20,7 +20,6 @@ #endif #include "open3d/core/Dtype.h" -#include "open3d/core/ParallelFor.h" #include "open3d/core/ShapeUtil.h" #include "open3d/core/Tensor.h" #include "open3d/t/geometry/Image.h" diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index c2839eacbae..e6de49e6ed6 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -36,3 +36,11 @@ if (BUILD_ISPC_MODULE) ParallelFor.ispc ) endif() + +# if (BUILD_SYCL_MODULE) +# target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES +# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") +# endif() \ No newline at end of file diff --git a/cpp/tests/core/CoreTest.cpp b/cpp/tests/core/CoreTest.cpp index 8f1486ce4c6..320a50fa01f 100644 --- a/cpp/tests/core/CoreTest.cpp +++ b/cpp/tests/core/CoreTest.cpp @@ -16,6 +16,12 @@ #include "open3d/core/SizeVector.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os) { + *os << device.ToString(); +} +void PrintTo(const Dtype &dtype, std::ostream *os) { *os << dtype.ToString(); } +} // namespace core namespace tests { std::vector PermuteDtypesWithBool::TestCases() { @@ -46,6 +52,7 @@ std::vector PermuteDevices::TestCases() { devices.push_back(cuda_devices[1]); } + utility::LogWarning("Total {} devices.", devices.size()); return devices; } @@ -54,7 +61,7 @@ std::vector PermuteDevicesWithSYCL::TestCases() { std::vector sycl_devices = core::Device::GetAvailableSYCLDevices(); if (!sycl_devices.empty()) { - devices.push_back(sycl_devices[0]); + devices.push_back(sycl_devices[0]); // only the first SYCL device } return devices; } diff --git a/cpp/tests/core/CoreTest.h b/cpp/tests/core/CoreTest.h index ab874b8b485..f8c6f5db273 100644 --- a/cpp/tests/core/CoreTest.h +++ b/cpp/tests/core/CoreTest.h @@ -15,6 +15,10 @@ #include "tests/Tests.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os); +void PrintTo(const Dtype &dtype, std::ostream *os); +} // namespace core namespace tests { class PermuteDtypesWithBool : public testing::TestWithParam { diff --git a/cpp/tests/core/Linalg.cpp b/cpp/tests/core/Linalg.cpp index 2c8be12323b..9b817ae61f8 100644 --- a/cpp/tests/core/Linalg.cpp +++ b/cpp/tests/core/Linalg.cpp @@ -23,10 +23,11 @@ namespace open3d { namespace tests { -class LinalgPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Linalg, - LinalgPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class LinalgPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Linalg, + LinalgPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(LinalgPermuteDevices, Matmul) { const float EPSILON = 1e-8; diff --git a/cpp/tests/core/ParallelForSYCL.cpp b/cpp/tests/core/ParallelForSYCL.cpp new file mode 100644 index 00000000000..ff78804faea --- /dev/null +++ b/cpp/tests/core/ParallelForSYCL.cpp @@ -0,0 +1,65 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/ParallelForSYCL.h" + +#include + +#include "open3d/Macro.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include 
"open3d/core/Tensor.h" +#include "tests/Tests.h" +#include "tests/core/CoreTest.h" + +struct TestIndexerFillKernel { + TestFillKernel(const core::Indexer &indexer_, int64_t multiplier_) + : indexer(indexer_), multiplier(multiplier_) {} + void operator()(int64_t idx) { + indexer.GetOutputPtr(0)[idx] = idx * multiplier; + } + +private: + core::Indexer indexer; + int64_t multiplier; +}; + +struct TestPtrFillKernel { + TestFillKernel(int64_t *out_, int64_t multiplier_) + : out(out_), multiplier(multiplier_) {} + void operator()(int64_t idx) { out[idx] = idx * multiplier; } + +private: + int64_t *out; + int64_t multiplier; +}; + +TEST(ParallelForSYCL, FunctorSYCL) { + const core::Device device("SYCL:0"); + const size_t N = 10000000; + core::Indexer indexer({}, tensor, DtypePolicy::NONE); + int64_t multiplier = 2; + + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL(device, indexer, + multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL( + device, N, tensor.GetDataPtr(), multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } +} \ No newline at end of file diff --git a/cpp/tests/core/Tensor.cpp b/cpp/tests/core/Tensor.cpp index 9f788116be7..0106c5ec6ec 100644 --- a/cpp/tests/core/Tensor.cpp +++ b/cpp/tests/core/Tensor.cpp @@ -95,7 +95,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ConstructorBool) { EXPECT_EQ(t.GetDtype(), dtype); } -TEST_P(TensorPermuteDevices, WithInitValue) { +TEST_P(TensorPermuteDevicesWithSYCL, WithInitValue) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5}; @@ -223,7 +223,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, WithInitValueSizeMismatch) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Arange) { +TEST_P(TensorPermuteDevicesWithSYCL, Arange) { core::Device device = GetParam(); core::Tensor arange; @@ -266,28 +266,28 @@ TEST_P(TensorPermuteDevices, Arange) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Fill) { +TEST_P(TensorPermuteDevicesWithSYCL, Fill) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Fill(1); EXPECT_EQ(t.ToFlatVector(), std::vector({1, 1, 1, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, FillBool) { +TEST_P(TensorPermuteDevicesWithSYCL, FillBool) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, false), {2, 3}, core::Bool, device); t.Fill(true); EXPECT_EQ(t.ToFlatVector(), std::vector(2 * 3, true)); } -TEST_P(TensorPermuteDevices, FillSlice) { +TEST_P(TensorPermuteDevicesWithSYCL, FillSlice) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Slice(1, 0, 3, 2).Fill(1); // t[:, 0:3:2].fill(1) EXPECT_EQ(t.ToFlatVector(), std::vector({1, 0, 1, 1, 0, 1})); } -TEST_P(TensorPermuteDevicePairs, IndexSetFillFancy) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetFillFancy) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -348,7 +348,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBool) { EXPECT_EQ(dst_t.ToFlatVector(), vals); } -TEST_P(TensorPermuteDevices, To) { +TEST_P(TensorPermuteDevicesWithSYCL, To) { core::Device device = GetParam(); core::SizeVector shape{2, 3}; @@ -364,7 +364,7 @@ 
TEST_P(TensorPermuteDevices, To) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevicePairs, ToDevice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, ToDevice) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -382,7 +382,7 @@ TEST_P(TensorPermuteDevicePairs, ToDevice) { EXPECT_ANY_THROW(src_t.To(core::Device("CUDA:100000"))); } -TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -403,7 +403,7 @@ TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Expand) { +TEST_P(TensorPermuteDevicesWithSYCL, Expand) { core::Device device = GetParam(); core::Dtype dtype(core::Float32); @@ -423,7 +423,7 @@ TEST_P(TensorPermuteDevices, Expand) { EXPECT_EQ(dst_t.GetDataPtr(), src_t.GetDataPtr()); } -TEST_P(TensorPermuteDevices, Flatten) { +TEST_P(TensorPermuteDevicesWithSYCL, Flatten) { core::Device device = GetParam(); // Flatten 0-D Tensor. @@ -562,7 +562,7 @@ TEST_P(TensorPermuteSizesDefaultStridesAndDevices, DefaultStrides) { EXPECT_EQ(t.GetStrides(), expected_strides); } -TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { +TEST_P(TensorPermuteDevicesWithSYCL, OperatorSquareBrackets) { core::Device device = GetParam(); // Zero dim @@ -623,7 +623,7 @@ TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { EXPECT_EQ(t_1_2_3.GetBlob(), t.GetBlob()); } -TEST_P(TensorPermuteDevices, Item) { +TEST_P(TensorPermuteDevicesWithSYCL, Item) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -645,7 +645,7 @@ TEST_P(TensorPermuteDevices, Item) { EXPECT_EQ(t_1_2_3.Item(), 23.f); } -TEST_P(TensorPermuteDevices, ItemBool) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemBool) { core::Device device = GetParam(); std::vector vals{true, true, false}; @@ -660,7 +660,7 @@ TEST_P(TensorPermuteDevices, ItemBool) { EXPECT_EQ(t[2].Item(), false); } -TEST_P(TensorPermuteDevices, ItemAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemAssign) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -782,7 +782,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyContiguous) { t_1_copy.GetBlob()->GetDataPtr()); // Points to beginning of Blob } -TEST_P(TensorPermuteDevices, Slice) { +TEST_P(TensorPermuteDevicesWithSYCL, Slice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -839,7 +839,7 @@ TEST_P(TensorPermuteDevices, Slice) { EXPECT_EQ(t_5.ToFlatVector(), std::vector({12, 16})); } -TEST_P(TensorPermuteDevices, GetItem) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItem) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -858,7 +858,7 @@ TEST_P(TensorPermuteDevices, GetItem) { std::vector({12, 14, 16, 18, 20, 22})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -874,7 +874,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { std::vector({0, 1, 1, 2, 3, 5, 8, 13, 21})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -894,7 +894,7 @@ 
TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({13, 17, 14, 18})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -914,7 +914,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { 16, 17, 18, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -937,7 +937,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { 16, 200, 400, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SliceAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, SliceAssign) { core::Device device = GetParam(); core::Tensor dst = core::Tensor::Init( @@ -991,7 +991,7 @@ TEST_P(TensorPermuteDevices, SliceAssign) { 16, 17, 18, 19, 203, 21, 223, 23})); } -TEST_P(TensorPermuteDevices, Append) { +TEST_P(TensorPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -1090,7 +1090,7 @@ TEST_P(TensorPermuteDevices, Append) { } } -TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyNonContiguous) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1127,7 +1127,7 @@ TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { } } -TEST_P(TensorPermuteDevicePairs, IndexGet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1164,7 +1164,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet) { EXPECT_EQ(src_t.GetDtype(), dst_t.GetDtype()); } -TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetNegative) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1188,7 +1188,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet2DBroadcastedIndex) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1218,7 +1218,8 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { 28, 29, 30, 31, 40, 41, 42, 43})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, + IndexGet2DBroadcastedIndexSplitBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1249,7 +1250,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { 16, 20, 40, 44, 17, 21, 41, 45})); } -TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetAssignToBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1279,7 +1280,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { std::vector({5, 10, 17, 22, 5, 10, 17, 22, 5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSeparateBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = 
GetParam(); @@ -1303,7 +1304,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { std::vector({0, 4, 8, 13, 17, 21})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSliceEnd) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1326,7 +1327,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { std::vector({0, 1, 2, 3, 16, 17, 18, 19})); } -TEST_P(TensorPermuteDevicePairs, IndexSet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSet) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1405,7 +1406,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSet) { core::Tensor::Init({10, 11}, src_device))); } -TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1430,7 +1431,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { 0, 0, 0, 0, 20, 20, 20, 0, 0, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, IndexAdd_) { +TEST_P(TensorPermuteDevicesWithSYCL, IndexAdd_) { core::Device device = GetParam(); const int tensor_size = 100; @@ -1464,7 +1465,7 @@ TEST_P(TensorPermuteDevices, IndexAdd_) { } } -TEST_P(TensorPermuteDevices, Permute) { +TEST_P(TensorPermuteDevicesWithSYCL, Permute) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1492,7 +1493,7 @@ TEST_P(TensorPermuteDevices, Permute) { 17, 21, 14, 18, 22, 15, 19, 23})); } -TEST_P(TensorPermuteDevices, Transpose) { +TEST_P(TensorPermuteDevicesWithSYCL, Transpose) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1512,7 +1513,7 @@ TEST_P(TensorPermuteDevices, Transpose) { EXPECT_THROW(t.Transpose(3, 5), std::runtime_error); } -TEST_P(TensorPermuteDevices, T) { +TEST_P(TensorPermuteDevicesWithSYCL, T) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -1556,7 +1557,7 @@ TEST_P(TensorPermuteDevices, Det) { EXPECT_ANY_THROW(core::Tensor::Ones({3, 4}, dtype, device).Det()); } -TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { +TEST_P(TensorPermuteDevicesWithSYCL, ShallowCopyConstructor) { core::Device device = GetParam(); core::Tensor t({2, 3}, core::Float32, device); @@ -1579,7 +1580,7 @@ TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { EXPECT_EQ(t.GetDataPtr(), FirstTensorDataPtr({t})); } -TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { +TEST_P(TensorPermuteDevicesWithSYCL, AdvancedIndexing_IsIndexSplittedBySlice) { core::Device device = GetParam(); core::Tensor idx = core::Tensor::Init({1, 2}, device); @@ -1602,7 +1603,7 @@ TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { {idx, slice, slice, idx})); } -TEST_P(TensorPermuteDevices, Add) { +TEST_P(TensorPermuteDevicesWithSYCL, Add) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1612,7 +1613,7 @@ TEST_P(TensorPermuteDevices, Add) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1622,7 +1623,7 @@ TEST_P(TensorPermuteDevices, Add_) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_BroadcastException) { 
+TEST_P(TensorPermuteDevicesWithSYCL, Add_BroadcastException) { // A.shape = ( 3, 4) // B.shape = (2, 3, 4) // A += B should throw exception. @@ -1642,7 +1643,7 @@ TEST_P(TensorPermuteDevices, Add_BroadcastException) { 20, 22, 24, 26, 28, 30, 32, 34})); } -TEST_P(TensorPermuteDevices, Sub) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1652,7 +1653,7 @@ TEST_P(TensorPermuteDevices, Sub) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Sub_) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1662,7 +1663,7 @@ TEST_P(TensorPermuteDevices, Sub_) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Mul) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1672,7 +1673,7 @@ TEST_P(TensorPermuteDevices, Mul) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Mul_) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1682,27 +1683,31 @@ TEST_P(TensorPermuteDevices, Mul_) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Div) { +TEST_P(TensorPermuteDevicesWithSYCL, Div) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); core::Tensor c = a / b; - EXPECT_EQ(c.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor c_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(c.AllClose(c_ref)); } -TEST_P(TensorPermuteDevices, Div_) { +TEST_P(TensorPermuteDevicesWithSYCL, Div_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); a /= b; - EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor a_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(a.AllClose(a_ref)); } -TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1774,7 +1779,7 @@ TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { EXPECT_THROW(src.Sum({2, -1}, true), std::runtime_error); // Repeated. 
} -TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumNotKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1829,7 +1834,7 @@ TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { EXPECT_EQ(dst.ToFlatVector(), std::vector({276.f})); } -TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -1908,7 +1913,7 @@ TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0})); } -TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMultipleOutputsSumLargeArray) { core::Device device = GetParam(); core::SizeVector shape{3, 7, 8234719}; int64_t size = shape.NumElements(); @@ -1925,7 +1930,7 @@ TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * 8234719, 3)); } -TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) { core::Device device = GetParam(); // num_bytes = 8 * (2 ^ 28) + 1 = 2 ^ 31 + 1 ~= 2GB // max_offsets = num_bytes - 1 = 2 ^ 31 @@ -1943,7 +1948,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { } // np.sum(np.ones((2, large_dim)), dim=0) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{2, large_dim}; @@ -1967,7 +1972,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) { } // np.sum(np.ones((2, large_dim)), dim=1) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{2, large_dim}; @@ -1991,7 +1996,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) { } // np.sum(np.ones((large_dim, 2)), dim=0) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{large_dim, 2}; @@ -2015,7 +2020,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) { } // np.sum(np.ones((large_dim, 2)), dim=1) -TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase3) { core::Device device = GetParam(); int64_t large_dim = (1ULL << 27) + 10; core::SizeVector shape{large_dim, 2}; @@ -2038,7 +2043,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { std::vector(large_dim - 30, 2)); } -TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumLargeArray) { core::Device device = GetParam(); std::vector sizes = TensorSizes::TestCases(); @@ -2060,7 +2065,7 @@ TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { } } -TEST_P(TensorPermuteDevices, ReduceProd) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceProd) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2117,7 +2122,7 @@ TEST_P(TensorPermuteDevices, ReduceProd) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMin) { core::Device device = 
GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2170,7 +2175,7 @@ TEST_P(TensorPermuteDevices, ReduceMin) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMax) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2293,7 +2298,7 @@ TEST_P(TensorPermuteDevices, ReduceArgMax) { std::vector({1, 2, 2, 1, 3, 2})); } -TEST_P(TensorPermuteDevices, Sqrt) { +TEST_P(TensorPermuteDevicesWithSYCL, Sqrt) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{0, 1, 4}, {9, 16, 25}}, device); @@ -2323,7 +2328,7 @@ TEST_P(TensorPermuteDevices, Sqrt) { std::vector({0, 1, 2, 3, 4, 5})); } -TEST_P(TensorPermuteDevices, Sin) { +TEST_P(TensorPermuteDevicesWithSYCL, Sin) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2346,7 +2351,7 @@ TEST_P(TensorPermuteDevices, Sin) { EXPECT_THROW(src.Sin(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Cos) { +TEST_P(TensorPermuteDevicesWithSYCL, Cos) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2369,7 +2374,7 @@ TEST_P(TensorPermuteDevices, Cos) { EXPECT_THROW(src.Cos(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Neg) { +TEST_P(TensorPermuteDevicesWithSYCL, Neg) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2388,7 +2393,7 @@ TEST_P(TensorPermuteDevices, Neg) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, UnaryMinus) { +TEST_P(TensorPermuteDevicesWithSYCL, UnaryMinus) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2403,7 +2408,7 @@ TEST_P(TensorPermuteDevices, UnaryMinus) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, Exp) { +TEST_P(TensorPermuteDevicesWithSYCL, Exp) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2426,7 +2431,7 @@ TEST_P(TensorPermuteDevices, Exp) { EXPECT_THROW(src.Exp(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Abs) { +TEST_P(TensorPermuteDevicesWithSYCL, Abs) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2444,7 +2449,7 @@ TEST_P(TensorPermuteDevices, Abs) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsNan) { +TEST_P(TensorPermuteDevicesWithSYCL, IsNan) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2458,7 +2463,7 @@ TEST_P(TensorPermuteDevices, IsNan) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsInf) { +TEST_P(TensorPermuteDevicesWithSYCL, IsInf) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2472,7 +2477,7 @@ TEST_P(TensorPermuteDevices, IsInf) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsFinite) { +TEST_P(TensorPermuteDevicesWithSYCL, IsFinite) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2486,7 +2491,7 @@ TEST_P(TensorPermuteDevices, IsFinite) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Floor) { +TEST_P(TensorPermuteDevicesWithSYCL, Floor) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2500,7 +2505,7 @@ TEST_P(TensorPermuteDevices, Floor) { 
EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Ceil) { +TEST_P(TensorPermuteDevicesWithSYCL, Ceil) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2514,7 +2519,7 @@ TEST_P(TensorPermuteDevices, Ceil) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Round) { +TEST_P(TensorPermuteDevicesWithSYCL, Round) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2528,7 +2533,7 @@ TEST_P(TensorPermuteDevices, Round) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Trunc) { +TEST_P(TensorPermuteDevicesWithSYCL, Trunc) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2542,7 +2547,7 @@ TEST_P(TensorPermuteDevices, Trunc) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNot) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNot) { core::Device device = GetParam(); std::vector src_vals{true, false, true, false}; @@ -2560,7 +2565,7 @@ TEST_P(TensorPermuteDevices, LogicalNot) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNotFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNotFloat) { core::Device device = GetParam(); std::vector src_vals{0, -1, 1, 2}; @@ -2584,7 +2589,7 @@ TEST_P(TensorPermuteDevices, LogicalNotFloat) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalAnd) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAnd) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2603,7 +2608,7 @@ TEST_P(TensorPermuteDevices, LogicalAnd) { std::vector({true, false, false, false})); } -TEST_P(TensorPermuteDevices, LogicalAndFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAndFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, 0}, {0, 0}}, device); @@ -2616,7 +2621,7 @@ TEST_P(TensorPermuteDevices, LogicalAndFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, LogicalOr) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOr) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2635,7 +2640,7 @@ TEST_P(TensorPermuteDevices, LogicalOr) { std::vector({true, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalOrFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOrFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2648,7 +2653,7 @@ TEST_P(TensorPermuteDevices, LogicalOrFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, LogicalXor) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXor) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2664,7 +2669,7 @@ TEST_P(TensorPermuteDevices, LogicalXor) { std::vector({false, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalXorFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXorFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2677,7 +2682,7 @@ TEST_P(TensorPermuteDevices, LogicalXorFloat) { 
EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, Gt) { +TEST_P(TensorPermuteDevicesWithSYCL, Gt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2693,7 +2698,7 @@ TEST_P(TensorPermuteDevices, Gt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Lt) { +TEST_P(TensorPermuteDevicesWithSYCL, Lt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2709,7 +2714,7 @@ TEST_P(TensorPermuteDevices, Lt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Ge) { +TEST_P(TensorPermuteDevicesWithSYCL, Ge) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2725,7 +2730,7 @@ TEST_P(TensorPermuteDevices, Ge) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Le) { +TEST_P(TensorPermuteDevicesWithSYCL, Le) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2741,7 +2746,7 @@ TEST_P(TensorPermuteDevices, Le) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Eq) { +TEST_P(TensorPermuteDevicesWithSYCL, Eq) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2757,7 +2762,7 @@ TEST_P(TensorPermuteDevices, Eq) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, Ne) { +TEST_P(TensorPermuteDevicesWithSYCL, Ne) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); @@ -2774,7 +2779,7 @@ TEST_P(TensorPermuteDevices, Ne) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, BooleanIndex) { +TEST_P(TensorPermuteDevicesWithSYCL, BooleanIndex) { core::Device device = GetParam(); // a[a < 0] = 0 @@ -2798,7 +2803,7 @@ TEST_P(TensorPermuteDevices, BooleanIndex) { EXPECT_EQ(y.GetDtype(), core::Float32); } -TEST_P(TensorPermuteDevices, NonZeroNumpy) { +TEST_P(TensorPermuteDevicesWithSYCL, NonZeroNumpy) { core::Device device = GetParam(); core::Tensor a = @@ -2812,7 +2817,7 @@ TEST_P(TensorPermuteDevices, NonZeroNumpy) { EXPECT_EQ(results[1].GetShape(), core::SizeVector{3}); } -TEST_P(TensorPermuteDevices, All) { +TEST_P(TensorPermuteDevicesWithSYCL, All) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2843,7 +2848,7 @@ TEST_P(TensorPermuteDevices, All) { EXPECT_ANY_THROW(t.All(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, Any) { +TEST_P(TensorPermuteDevicesWithSYCL, Any) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2875,7 +2880,7 @@ TEST_P(TensorPermuteDevices, Any) { EXPECT_ANY_THROW(t.Any(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, CreationEmpty) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationEmpty) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Empty({}, core::Float32, device); @@ -2899,7 +2904,7 @@ 
TEST_P(TensorPermuteDevices, CreationEmpty) { EXPECT_EQ(a.NumElements(), 6); } -TEST_P(TensorPermuteDevices, CreationFull) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationFull) { core::Device device = GetParam(); const float fill_value = 100; @@ -2934,7 +2939,7 @@ TEST_P(TensorPermuteDevices, CreationFull) { std::vector(a.NumElements(), fill_value)); } -TEST_P(TensorPermuteDevices, CreationZeros) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationZeros) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Zeros({2, 3}, core::Float32, device); @@ -2943,7 +2948,7 @@ TEST_P(TensorPermuteDevices, CreationZeros) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 0)); } -TEST_P(TensorPermuteDevices, CreationOnes) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationOnes) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Ones({2, 3}, core::Float32, device); @@ -2952,7 +2957,7 @@ TEST_P(TensorPermuteDevices, CreationOnes) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 1)); } -TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { +TEST_P(TensorPermuteDevicesWithSYCL, ScalarOperatorOverload) { core::Device device = GetParam(); core::Tensor a; core::Tensor b; @@ -3038,7 +3043,7 @@ TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { EXPECT_EQ(a.ToFlatVector(), std::vector({5, 5})); } -TEST_P(TensorPermuteDevices, ReduceMean) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMean) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -3129,7 +3134,7 @@ TEST_P(TensorPermuteDevices, ReduceMean) { EXPECT_TRUE(std::isnan(dst.ToFlatVector()[0])); } -TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { +TEST_P(TensorPermuteDevicesWithSYCL, ToDLPackFromDLPack) { core::Device device = GetParam(); core::Tensor src_t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -3162,7 +3167,7 @@ TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { std::vector({12, 14, 20, 22})); } -TEST_P(TensorPermuteDevices, IsSame) { +TEST_P(TensorPermuteDevicesWithSYCL, IsSame) { core::Device device = GetParam(); // "Shallow" copy. 
@@ -3206,7 +3211,7 @@ TEST_P(TensorPermuteDevices, IsSame) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorPermuteDevices, RValueScalar) { +TEST_P(TensorPermuteDevicesWithSYCL, RValueScalar) { const core::Device &device = GetParam(); core::Tensor t, t_ref; @@ -3271,7 +3276,7 @@ TEST_P(TensorPermuteDevices, RValueScalar) { EXPECT_TRUE(t.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip) { core::Device device = GetParam(); core::Tensor t, t_clip, t_ref; @@ -3324,7 +3329,7 @@ TEST_P(TensorPermuteDevices, Clip) { EXPECT_TRUE(t_clip.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip_) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip_) { core::Device device = GetParam(); core::Tensor t, t_ref; @@ -3414,7 +3419,7 @@ TEST_P(TensorPermuteDevicePairs, AllEqual) { EXPECT_FALSE(src.AllEqual(dst)); } -TEST_P(TensorPermuteDevices, Iterator) { +TEST_P(TensorPermuteDevicesWithSYCL, Iterator) { core::Device device = GetParam(); core::Tensor t; @@ -3492,7 +3497,7 @@ TEST_P(TensorPermuteDevices, Iterator) { } } -TEST_P(TensorPermuteDevices, ConstIterator) { +TEST_P(TensorPermuteDevicesWithSYCL, ConstIterator) { core::Device device = GetParam(); core::Tensor t; @@ -3565,7 +3570,7 @@ TEST_P(TensorPermuteDevices, ConstIterator) { } } -TEST_P(TensorPermuteDevices, TakeOwnership) { +TEST_P(TensorPermuteDevicesWithSYCL, TakeOwnership) { core::Device device = GetParam(); if (!device.IsCPU()) { GTEST_SKIP(); diff --git a/cpp/tests/core/TensorCheck.cpp b/cpp/tests/core/TensorCheck.cpp index 8b789b481b6..2ab52141806 100644 --- a/cpp/tests/core/TensorCheck.cpp +++ b/cpp/tests/core/TensorCheck.cpp @@ -14,12 +14,13 @@ namespace open3d { namespace tests { -class TensorCheckPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorCheckPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorCheckPermuteDevicesWithSYCL : public PermuteDevices {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + TensorCheckPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDtype) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -58,7 +59,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { } } -TEST_P(TensorCheckPermuteDevices, AssertTensorDtypes) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDtypes) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -87,7 +88,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDtypes) { t, std::vector({core::Int32, core::Int64}))); } -TEST_P(TensorCheckPermuteDevices, AssertTensorDevice) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorDevice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Empty({}, core::Float32, device); @@ -105,7 +106,7 @@ TEST_P(TensorCheckPermuteDevices, AssertTensorDevice) { } } -TEST_P(TensorCheckPermuteDevices, AssertTensorShape) { +TEST_P(TensorCheckPermuteDevicesWithSYCL, AssertTensorShape) { core::Device device = GetParam(); core::Tensor t; diff --git a/cpp/tests/core/TensorFunction.cpp b/cpp/tests/core/TensorFunction.cpp index 0d25b941dfd..8d97fe59720 100644 --- a/cpp/tests/core/TensorFunction.cpp +++ b/cpp/tests/core/TensorFunction.cpp @@ -15,12 +15,12 @@ namespace open3d { namespace tests { -class TensorFunctionPermuteDevices : public PermuteDevices {}; +class 
TensorFunctionPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Tensor, - TensorFunctionPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + TensorFunctionPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -TEST_P(TensorFunctionPermuteDevices, Concatenate) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Concatenate) { core::Device device = GetParam(); core::Tensor a, b, c, output_tensor; @@ -105,13 +105,13 @@ TEST_P(TensorFunctionPermuteDevices, Concatenate) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Concatenate({a, b.To(core::Float64), c})); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW( core::Concatenate({a, b.To(core::Device("CPU:0")), c})); } } -TEST_P(TensorFunctionPermuteDevices, Append) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -205,7 +205,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Append(self, other.To(core::Float64))); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW(core::Append(self, other.To(core::Device("CPU:0")))); } @@ -215,7 +215,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { EXPECT_TRUE(core::Append(self, other).AllClose(self.Append(other))); } -TEST_P(TensorFunctionPermuteDevices, Maximum) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Maximum) { core::Device device = GetParam(); core::Tensor input, other, output; @@ -268,7 +268,7 @@ TEST_P(TensorFunctionPermuteDevices, Maximum) { core::Tensor::Init({true, true, true, true}, device))); } -TEST_P(TensorFunctionPermuteDevices, Minimum) { +TEST_P(TensorFunctionPermuteDevicesWithSYCL, Minimum) { core::Device device = GetParam(); core::Tensor input, other, output; diff --git a/cpp/tests/core/TensorObject.cpp b/cpp/tests/core/TensorObject.cpp index 0ff294dbc88..bfd28430b85 100644 --- a/cpp/tests/core/TensorObject.cpp +++ b/cpp/tests/core/TensorObject.cpp @@ -22,10 +22,11 @@ namespace open3d { namespace tests { -class TensorObjectPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorObject, - TensorObjectPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorObjectPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorObject, + TensorObjectPermuteDevicesWithSYCL, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); class TensorObjectPermuteDevicePairs : public PermuteDevicePairs {}; INSTANTIATE_TEST_SUITE_P( @@ -51,7 +52,7 @@ static_assert(std::is_pod(), "TestObject must be a POD."); static const int64_t byte_size = sizeof(TestObject); static const std::string class_name = "TestObject"; -TEST_P(TensorObjectPermuteDevices, Constructor) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, Constructor) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -70,7 +71,7 @@ TEST_P(TensorObjectPermuteDevices, Constructor) { EXPECT_ANY_THROW(core::Tensor({-1, -1}, dtype, device)); } -TEST_P(TensorObjectPermuteDevices, WithInitValueObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, WithInitValueObject) { core::Device device = GetParam(); core::Dtype dtype = core::Dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -81,7 +82,7 @@ TEST_P(TensorObjectPermuteDevices, WithInitValueObject) { EXPECT_EQ(t.ToFlatVector(), vals); } -TEST_P(TensorObjectPermuteDevices,
FillObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, FillObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -156,7 +157,7 @@ TEST_P(TensorObjectPermuteDevicePairs, CopyBroadcastObject) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorObjectPermuteDevices, ItemObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ItemObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -172,7 +173,7 @@ TEST_P(TensorObjectPermuteDevices, ItemObject) { EXPECT_EQ(t[2].Item(), TestObject(4)); } -TEST_P(TensorObjectPermuteDevices, ItemAssignObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ItemAssignObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -187,7 +188,7 @@ TEST_P(TensorObjectPermuteDevices, ItemAssignObject) { EXPECT_EQ(t[1][2][3].Item(), TestObject(100)); } -TEST_P(TensorObjectPermuteDevices, IsSameObject) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, IsSameObject) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -230,7 +231,7 @@ TEST_P(TensorObjectPermuteDevices, IsSameObject) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorObjectPermuteDevices, ConstructFromObjectTensorVector) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, ConstructFromObjectTensorVector) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); @@ -255,7 +256,7 @@ TEST_P(TensorObjectPermuteDevices, ConstructFromObjectTensorVector) { EXPECT_FALSE(tl[2].IsSame(t2)); } -TEST_P(TensorObjectPermuteDevices, TensorListFromObjectTensor) { +TEST_P(TensorObjectPermuteDevicesWithSYCL, TensorListFromObjectTensor) { core::Device device = GetParam(); core::Dtype dtype(core::Dtype::DtypeCode::Object, byte_size, class_name); diff --git a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp index 2139b40263c..d0a7189edd8 100644 --- a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp +++ b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(AxisAlignedBoundingBox, - AxisAlignedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + AxisAlignedBoundingBox, + AxisAlignedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class AxisAlignedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class AxisAlignedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( AxisAlignedBoundingBox, AxisAlignedBoundingBoxPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/LineSet.cpp b/cpp/tests/t/geometry/LineSet.cpp index 30c86eaa78f..fb6eefb41c1 100644 --- a/cpp/tests/t/geometry/LineSet.cpp +++ b/cpp/tests/t/geometry/LineSet.cpp @@ -16,12 +16,13 @@ namespace open3d { namespace tests { -class LineSetPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(LineSet, - LineSetPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class LineSetPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + LineSet, + LineSetPermuteDevices, + 
testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class LineSetPermuteDevicePairs : public PermuteDevicePairs {}; +class LineSetPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( LineSet, LineSetPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/OrientedBoundingBox.cpp b/cpp/tests/t/geometry/OrientedBoundingBox.cpp index 5a730f4cfb2..8ac9cf7e737 100644 --- a/cpp/tests/t/geometry/OrientedBoundingBox.cpp +++ b/cpp/tests/t/geometry/OrientedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class OrientedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(OrientedBoundingBox, - OrientedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OrientedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + OrientedBoundingBox, + OrientedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class OrientedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class OrientedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( OrientedBoundingBox, OrientedBoundingBoxPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/PointCloud.cpp b/cpp/tests/t/geometry/PointCloud.cpp index 2a74c9c75c1..604d91f7358 100644 --- a/cpp/tests/t/geometry/PointCloud.cpp +++ b/cpp/tests/t/geometry/PointCloud.cpp @@ -24,12 +24,13 @@ namespace open3d { namespace tests { -class PointCloudPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(PointCloud, - PointCloudPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class PointCloudPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + PointCloud, + PointCloudPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class PointCloudPermuteDevicePairs : public PermuteDevicePairs {}; +class PointCloudPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( PointCloud, PointCloudPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/TensorMap.cpp b/cpp/tests/t/geometry/TensorMap.cpp index edf80230073..9ae1a37e300 100644 --- a/cpp/tests/t/geometry/TensorMap.cpp +++ b/cpp/tests/t/geometry/TensorMap.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorMapPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorMap, - TensorMapPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorMapPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorMap, + TensorMapPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TensorMapPermuteDevices, Constructor) { core::Dtype dtype = core::Float32; diff --git a/cpp/tests/t/geometry/TriangleMesh.cpp b/cpp/tests/t/geometry/TriangleMesh.cpp index 7bca9b4a55e..a7ecf816a03 100644 --- a/cpp/tests/t/geometry/TriangleMesh.cpp +++ b/cpp/tests/t/geometry/TriangleMesh.cpp @@ -20,10 +20,11 @@ namespace open3d { namespace tests { -class TriangleMeshPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TriangleMesh, - TriangleMeshPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TriangleMeshPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TriangleMesh, + TriangleMeshPermuteDevices, + testing::ValuesIn(TriangleMeshPermuteDevices::TestCases())); TEST_P(TriangleMeshPermuteDevices, DefaultConstructor) { t::geometry::TriangleMesh mesh;
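All of the test-suite changes above follow one pattern: each fixture is re-parented to, or re-instantiated from, PermuteDevicesWithSYCL, so every TEST_P body also runs on SYCL devices. The sketch below is for orientation only and is not part of the patch: the real definition lives in cpp/tests/core/CoreTest.h / CoreTest.cpp (modified by this patch), and the "SYCL:0" device string and the guard macro used here are assumptions.

// Minimal sketch (assumption) of the fixture shape the renamed tests rely on.
#include <vector>
#include <gtest/gtest.h>
#include "open3d/core/Device.h"

namespace open3d {
namespace tests {

class PermuteDevicesWithSYCL : public testing::TestWithParam<core::Device> {
public:
    // Devices every TEST_P runs on: CPU always, plus a SYCL device when the
    // SYCL module is built. Guard macro and device index are illustrative.
    static std::vector<core::Device> TestCases() {
        std::vector<core::Device> devices{core::Device("CPU:0")};
#ifdef BUILD_SYCL_MODULE
        devices.push_back(core::Device("SYCL:0"));
#endif
        return devices;
    }
};

}  // namespace tests
}  // namespace open3d

// Usage, matching the instantiations in this patch, e.g.:
// INSTANTIATE_TEST_SUITE_P(Tensor, TensorPermuteDevicesWithSYCL,
//                          testing::ValuesIn(PermuteDevicesWithSYCL::TestCases()));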