From 079dd3f5924ca312690e3baa9a1e116bf303f22d Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 02:05:19 +0000
Subject: [PATCH 01/11] add sycl preset

---
 CMakePresets.json | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index e2b7a79e371bf..265843c84f032 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -11,9 +11,22 @@
             "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
     },
-
+    {
+        "name": "sycl-base",
+        "hidden": true,
+        "generator": "Ninja",
+        "binaryDir": "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_CXX_COMPILER": "icx",
+            "CMAKE_C_COMPILER": "icx",
+            "LLAMA_SYCL": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
     { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
 
     {
@@ -35,15 +48,18 @@
     },
 
     { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
 
     { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
 
     { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+    
+    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }

From 4488134edfd244af53c2c4ea141ca3223e8bd354 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 07:43:51 +0000
Subject: [PATCH 02/11] fix debug link error. fix windows crash

---
 CMakeLists.txt            |    7 +-
 CMakePresets.json         |    1 -
 ggml-sycl.cpp             |    2 +-
 ggml-sycl/dpct/helper.hpp | 1603 ++++++++++++++++++-------------------
 ggml.h                    |    6 +
 5 files changed, 808 insertions(+), 811 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c90414afa92be..9cfe08d7b7d59 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -665,6 +665,7 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
 
     message(STATUS "SYCL found")
 
@@ -679,11 +680,9 @@ if (LLAMA_SYCL)
     endif()
 
     add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})
 
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
     if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
     endif()
@@ -693,8 +692,10 @@ if (LLAMA_SYCL)
     list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
 
     if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
         if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
diff --git a/CMakePresets.json b/CMakePresets.json
index 265843c84f032..501b33073c8b8 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -19,7 +19,6 @@
         "cacheVariables": {
             "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
             "CMAKE_CXX_COMPILER": "icx",
-            "CMAKE_C_COMPILER": "icx",
             "LLAMA_SYCL": "ON",
             "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 485f06ad331f8..e5ddf4a346c36 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4911,7 +4911,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
-    GGML_TENSOR_BINARY_OP_LOCALS;
+    GGML_TENSOR_BINARY_OP_LOCALS01;
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp
index 017fd6ee13268..af484d8333e59 100644
--- a/ggml-sycl/dpct/helper.hpp
+++ b/ggml-sycl/dpct/helper.hpp
@@ -58,7 +58,7 @@
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
-inline std::string get_device_type_name(const sycl::device &Device) {
+inline std::string get_device_type_name(const sycl::device& Device) {
     auto DeviceType = Device.get_info<sycl::info::device::device_type>();
     switch (DeviceType) {
     case sycl::info::device_type::cpu:
@@ -74,39 +74,39 @@ inline std::string get_device_type_name(const sycl::device &Device) {
     }
 }
 
-inline std::string get_device_backend_and_type(const sycl::device &device) {
+inline std::string get_device_backend_and_type(const sycl::device& device) {
     std::stringstream device_type;
     sycl::backend backend = device.get_backend();
-    device_type <<  backend << ":" << get_device_type_name(device);
+    device_type << backend << ":" << get_device_type_name(device);
     return device_type.str();
 }
 
 namespace dpct
 {
-    typedef sycl::queue *queue_ptr;
-    typedef sycl::event *event_ptr;
-    typedef char *device_ptr;
+    typedef sycl::queue* queue_ptr;
+    typedef sycl::event* event_ptr;
+    typedef char* device_ptr;
     typedef uint8_t byte_t;
     typedef sycl::buffer<byte_t> buffer_t;
 
     /// SYCL default exception handler
     inline auto exception_handler = [](sycl::exception_list exceptions)
-    {
-        for (std::exception_ptr const &e : exceptions)
         {
-            try
-            {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e)
+            for (std::exception_ptr const& e : exceptions)
             {
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                          << e.what() << std::endl
-                          << "Exception caught at file:" << __FILE__
-                          << ", line:" << __LINE__ << std::endl;
+                try
+                {
+                    std::rethrow_exception(e);
+                }
+                catch (sycl::exception const& e)
+                {
+                    std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                        << e.what() << std::endl
+                        << "Exception caught at file:" << __FILE__
+                        << ", line:" << __LINE__ << std::endl;
+                }
             }
-        }
-    };
+        };
 
     enum error_code
     {
@@ -196,7 +196,7 @@ namespace dpct
 
     namespace detail
     {
-        static void get_version(const sycl::device &dev, int &major, int &minor)
+        static void get_version(const sycl::device& dev, int& major, int& minor)
         {
             // Version string has the following format:
             // a. OpenCL<space><major.minor><space><vendor-specific-information>
@@ -206,23 +206,24 @@ namespace dpct
             ver = dev.get_info<sycl::info::device::version>();
             std::string::size_type i = 0;
             while (i < ver.size()) {
-              if (isdigit(ver[i]))
-                break;
-              i++;
+                if (isdigit(ver[i]))
+                    break;
+                i++;
             }
             major = std::stoi(&(ver[i]));
             while (i < ver.size()) {
-              if (ver[i] == '.')
-                break;
-              i++;
+                if (ver[i] == '.')
+                    break;
+                i++;
             }
             if (i < ver.size()) {
-              // a. and b.
-              i++;
-              minor = std::stoi(&(ver[i]));
-            } else {
-              // c.
-              minor = 0;
+                // a. and b.
+                i++;
+                minor = std::stoi(&(ver[i]));
+            }
+            else {
+                // c.
+                minor = 0;
             }
         }
 
@@ -231,7 +232,7 @@ namespace dpct
         {
         public:
             generic_error_type() = default;
-            generic_error_type(T value) : value{value} {}
+            generic_error_type(T value) : value{ value } {}
             operator T() const { return value; }
 
         private:
@@ -245,11 +246,11 @@ namespace dpct
     {
     public:
         pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-        pitched_data(void *data, size_t pitch, size_t x, size_t y)
+        pitched_data(void* data, size_t pitch, size_t x, size_t y)
             : _data(data), _pitch(pitch), _x(x), _y(y) {}
 
-        void *get_data_ptr() { return _data; }
-        void set_data_ptr(void *data) { _data = data; }
+        void* get_data_ptr() { return _data; }
+        void set_data_ptr(void* data) { _data = data; }
 
         size_t get_pitch() { return _pitch; }
         void set_pitch(size_t pitch) { _pitch = pitch; }
@@ -261,7 +262,7 @@ namespace dpct
         void set_y(size_t y) { _y = y; }
 
     private:
-        void *_data;
+        void* _data;
         size_t _pitch, _x, _y;
     };
 
@@ -269,33 +270,33 @@ namespace dpct
     {
     public:
         // get interface
-        const char *get_name() const { return _name; }
-        char *get_name() { return _name; }
+        const char* get_name() const { return _name; }
+        char* get_name() { return _name; }
         template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
+            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+            std::is_same_v<WorkItemSizesTy, int*>,
+            int> = 0>
         auto get_max_work_item_sizes() const
         {
             if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
                 return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
+                    _max_work_item_sizes_i[1],
+                    _max_work_item_sizes_i[2]);
             else
             {
                 return _max_work_item_sizes_i;
             }
         }
         template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
+            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+            std::is_same_v<WorkItemSizesTy, int*>,
+            int> = 0>
         auto get_max_work_item_sizes()
         {
             if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
                 return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
+                    _max_work_item_sizes_i[1],
+                    _max_work_item_sizes_i[2]);
             else
             {
                 return _max_work_item_sizes_i;
@@ -317,24 +318,24 @@ namespace dpct
         {
             return _max_register_size_per_work_group;
         }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
+        template <typename NDRangeSizeTy = size_t*,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t*> ||
+            std::is_same_v<NDRangeSizeTy, int*>,
+            int> = 0>
         auto get_max_nd_range_size() const
         {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t*>)
                 return _max_nd_range_size;
             else
                 return _max_nd_range_size_i;
         }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
+        template <typename NDRangeSizeTy = size_t*,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t*> ||
+            std::is_same_v<NDRangeSizeTy, int*>,
+            int> = 0>
         auto get_max_nd_range_size()
         {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t*>)
                 return _max_nd_range_size;
             else
                 return _max_nd_range_size_i;
@@ -357,7 +358,7 @@ namespace dpct
         }
 
         // set interface
-        void set_name(const char *name)
+        void set_name(const char* name)
         {
             size_t length = strlen(name);
             if (length < 256)
@@ -376,7 +377,7 @@ namespace dpct
                 _max_work_item_sizes_i[i] = max_work_item_sizes[i];
         }
         [[deprecated]] void
-        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+            set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
         {
             for (int i = 0; i < 3; ++i)
             {
@@ -416,7 +417,7 @@ namespace dpct
             _max_sub_group_size = max_sub_group_size;
         }
         void
-        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+            set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
         {
             _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
         }
@@ -437,7 +438,7 @@ namespace dpct
             _memory_bus_width = memory_bus_width;
         }
         void
-        set_max_register_size_per_work_group(int max_register_size_per_work_group)
+            set_max_register_size_per_work_group(int max_register_size_per_work_group)
         {
             _max_register_size_per_work_group = max_register_size_per_work_group;
         }
@@ -481,21 +482,21 @@ namespace dpct
         std::array<unsigned char, 16> _uuid;
     };
 
-    static int get_major_version(const sycl::device &dev)
+    static int get_major_version(const sycl::device& dev)
     {
         int major, minor;
         detail::get_version(dev, major, minor);
         return major;
     }
 
-    static int get_minor_version(const sycl::device &dev)
+    static int get_minor_version(const sycl::device& dev)
     {
         int major, minor;
         detail::get_version(dev, major, minor);
         return minor;
     }
 
-    static void get_device_info(device_info &out, const sycl::device &dev)
+    static void get_device_info(device_info& out, const sycl::device& dev)
     {
         device_info prop;
         prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
@@ -556,17 +557,17 @@ namespace dpct
         Use 3200000 kHz as memory_clock_rate default value. \
         Use 64 bits as memory_bus_width default value.")
 #else
-#warning "get_device_info: querying memory_clock_rate and \
+        #warning "get_device_info: querying memory_clock_rate and \
         memory_bus_width are not supported by the compiler used. \
         Use 3200000 kHz as memory_clock_rate default value. \
         Use 64 bits as memory_bus_width default value."
 #endif
 
-        size_t max_sub_group_size = 1;
+            size_t max_sub_group_size = 1;
         std::vector<size_t> sub_group_sizes =
             dev.get_info<sycl::info::device::sub_group_sizes>();
 
-        for (const auto &sub_group_size : sub_group_sizes)
+        for (const auto& sub_group_size : sub_group_sizes)
         {
             if (max_sub_group_size < sub_group_size)
                 max_sub_group_size = sub_group_size;
@@ -576,7 +577,7 @@ namespace dpct
 
         prop.set_max_work_items_per_compute_unit(
             dev.get_info<sycl::info::device::max_work_group_size>());
-        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+        int max_nd_range_size[] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
         prop.set_max_nd_range_size(max_nd_range_size);
 
         // Estimates max register size per work group, feel free to update the value
@@ -594,13 +595,13 @@ namespace dpct
         typedef std::mutex mutex_type;
 
     public:
-        device_ext() : sycl::device(), _ctx(*this) {}
+        device_ext() : sycl::device() {}
         ~device_ext()
         {
             std::lock_guard<mutex_type> lock(m_mutex);
             clear_queues();
         }
-        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
+        device_ext(const sycl::device& base) : sycl::device(base)
         {
             std::lock_guard<mutex_type> lock(m_mutex);
             init_queues();
@@ -663,12 +664,12 @@ namespace dpct
         /// Get the number of bytes of free and total memory on the SYCL device.
         /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
         /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-        void get_memory_info(size_t &free_memory, size_t &total_memory)
+        void get_memory_info(size_t& free_memory, size_t& total_memory)
         {
             total_memory = get_device_info().get_global_mem_size();
-            const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
-                                 "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
-                                 "use total memory as free memory";
+            const char* warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
+                "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
+                "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
             if (!has(sycl::aspect::ext_intel_free_memory))
             {
@@ -685,12 +686,12 @@ namespace dpct
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma message("Querying the number of bytes of free memory is not supported")
 #else
-#warning "Querying the number of bytes of free memory is not supported"
+            #warning "Querying the number of bytes of free memory is not supported"
 #endif
 #endif
         }
 
-        void get_device_info(device_info &out) const
+        void get_device_info(device_info& out) const
         {
             dpct::get_device_info(out, *this);
         }
@@ -709,11 +710,11 @@ namespace dpct
             init_queues();
         }
 
-        sycl::queue &in_order_queue() { return *_q_in_order; }
+        sycl::queue& in_order_queue() { return _q_in_order; }
 
-        sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+        sycl::queue& out_of_order_queue() { return _q_out_of_order; }
 
-        sycl::queue &default_queue()
+        sycl::queue& default_queue()
         {
             return in_order_queue();
         }
@@ -721,130 +722,120 @@ namespace dpct
         void queues_wait_and_throw()
         {
             std::unique_lock<mutex_type> lock(m_mutex);
-            std::vector<std::shared_ptr<sycl::queue>> current_queues(
-                _queues);
             lock.unlock();
-            for (const auto &q : current_queues)
+            for (auto& q : _queues)
             {
-                q->wait_and_throw();
+                q.wait_and_throw();
             }
             // Guard the destruct of current_queues to make sure the ref count is safe.
             lock.lock();
         }
 
-        sycl::queue *create_queue(bool enable_exception_handler = false)
+        sycl::queue create_queue(bool enable_exception_handler = false)
         {
             return create_in_order_queue(enable_exception_handler);
         }
 
-        sycl::queue *create_queue(sycl::context context, sycl::device device,
-                                bool enable_exception_handler = false) {
-            return create_in_order_queue(context, device, enable_exception_handler);
+        sycl::queue create_queue(sycl::device device,
+            bool enable_exception_handler = false) {
+            return create_in_order_queue(device, enable_exception_handler);
         }
 
-        sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
+        sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
             std::lock_guard<mutex_type> lock(m_mutex);
             return create_queue_impl(enable_exception_handler,
-                                    sycl::property::queue::in_order());
+                sycl::property::queue::in_order());
         }
 
-        sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
-                                        bool enable_exception_handler = false) {
+        sycl::queue create_in_order_queue(sycl::device device,
+            bool enable_exception_handler = false) {
             std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(context, device, enable_exception_handler,
-                                    sycl::property::queue::in_order());
+            return create_queue_impl(device, enable_exception_handler,
+                sycl::property::queue::in_order());
         }
 
-        sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
+        sycl::queue create_out_of_order_queue(bool enable_exception_handler = false) {
             std::lock_guard<mutex_type> lock(m_mutex);
             return create_queue_impl(enable_exception_handler);
         }
 
-        void destroy_queue(sycl::queue *&queue)
+        void destroy_queue(sycl::queue queue)
         {
             std::lock_guard<mutex_type> lock(m_mutex);
-            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
-                                         {
-                                             return q.get() == queue;
-                                         }),
-                          _queues.end());
-            queue = nullptr;
+            _queues.clear();
         }
-        void set_saved_queue(sycl::queue *q)
+        void set_saved_queue(sycl::queue q)
         {
             std::lock_guard<mutex_type> lock(m_mutex);
             _saved_queue = q;
         }
-        sycl::queue *get_saved_queue() const
+        sycl::queue get_saved_queue() const
         {
             std::lock_guard<mutex_type> lock(m_mutex);
             return _saved_queue;
         }
-        sycl::context get_context() const { return _ctx; }
 
     private:
         void clear_queues()
         {
             _queues.clear();
-            _q_in_order = _q_out_of_order = _saved_queue = nullptr;
         }
 
         void init_queues()
         {
             _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
             _q_out_of_order = create_queue_impl(true);
-            _saved_queue = &default_queue();
+            _saved_queue = default_queue();
         }
 
         /// Caller should acquire resource \p m_mutex before calling this function.
         template <class... Properties>
-        sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                       Properties... properties)
+        sycl::queue create_queue_impl(bool enable_exception_handler,
+            Properties... properties)
         {
             sycl::async_handler eh = {};
             if (enable_exception_handler)
             {
                 eh = exception_handler;
             }
-            _queues.push_back(std::make_shared<sycl::queue>(
-                _ctx, *this, eh,
+            auto q = sycl::queue(
+                *this, eh,
                 sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                     sycl::property::queue::enable_profiling(),
 #endif
-                    properties...)));
+                    properties...));
+            _queues.push_back(q);
 
-            return _queues.back().get();
+            return _queues.back();
         }
 
         template <class... Properties>
-        sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
-                                    bool enable_exception_handler,
-                                    Properties... properties) {
+        sycl::queue create_queue_impl(sycl::device device,
+            bool enable_exception_handler,
+            Properties... properties) {
             sycl::async_handler eh = {};
             if (enable_exception_handler) {
                 eh = exception_handler;
             }
-            _queues.push_back(std::make_shared<sycl::queue>(
-                context, device, eh,
+            _queues.push_back(sycl::queue(
+                device, eh,
                 sycl::property_list(
-        #ifdef DPCT_PROFILING_ENABLED
+#ifdef DPCT_PROFILING_ENABLED
                     sycl::property::queue::enable_profiling(),
-        #endif
+#endif
                     properties...)));
 
-            return _queues.back().get();
+            return _queues.back();
         }
 
-        void get_version(int &major, int &minor) const
+        void get_version(int& major, int& minor) const
         {
             detail::get_version(*this, major, minor);
         }
-        sycl::queue *_q_in_order, *_q_out_of_order;
-        sycl::queue *_saved_queue;
-        sycl::context _ctx;
-        std::vector<std::shared_ptr<sycl::queue>> _queues;
+        sycl::queue _q_in_order, _q_out_of_order;
+        sycl::queue _saved_queue;
+        std::vector<sycl::queue> _queues;
         mutable mutex_type m_mutex;
     };
 
@@ -852,13 +843,13 @@ namespace dpct
     class dev_mgr
     {
     public:
-        device_ext &current_device()
+        device_ext& current_device()
         {
             unsigned int dev_id = current_device_id();
             check_id(dev_id);
             return *_devs[dev_id];
         }
-        device_ext &cpu_device() const
+        device_ext& cpu_device() const
         {
             std::lock_guard<std::recursive_mutex> lock(m_mutex);
             if (_cpu_device == -1)
@@ -870,7 +861,7 @@ namespace dpct
                 return *_devs[_cpu_device];
             }
         }
-        device_ext &get_device(unsigned int id) const
+        device_ext& get_device(unsigned int id) const
         {
             std::lock_guard<std::recursive_mutex> lock(m_mutex);
             check_id(id);
@@ -896,7 +887,7 @@ namespace dpct
         }
         unsigned int device_count() { return _devs.size(); }
 
-        unsigned int get_device_id(const sycl::device &dev)
+        unsigned int get_device_id(const sycl::device& dev)
         {
             unsigned int id = 0;
             for (auto dev_item : _devs)
@@ -912,8 +903,8 @@ namespace dpct
 
         template <class DeviceSelector>
         std::enable_if_t<
-            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+            std::is_invocable_r_v<int, DeviceSelector, const sycl::device&>>
+            select_device(const DeviceSelector& selector = sycl::gpu_selector_v)
         {
             sycl::device selected_device = sycl::device(selector);
             unsigned int selected_device_id = get_device_id(selected_device);
@@ -921,32 +912,32 @@ namespace dpct
         }
 
         /// Returns the instance of device manager singleton.
-        static dev_mgr &instance()
+        static dev_mgr& instance()
         {
             static dev_mgr d_m;
             return d_m;
         }
-        dev_mgr(const dev_mgr &) = delete;
-        dev_mgr &operator=(const dev_mgr &) = delete;
-        dev_mgr(dev_mgr &&) = delete;
-        dev_mgr &operator=(dev_mgr &&) = delete;
+        dev_mgr(const dev_mgr&) = delete;
+        dev_mgr& operator=(const dev_mgr&) = delete;
+        dev_mgr(dev_mgr&&) = delete;
+        dev_mgr& operator=(dev_mgr&&) = delete;
 
     private:
         mutable std::recursive_mutex m_mutex;
-        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        static bool compare_dev(sycl::device& device1, sycl::device& device2)
         {
             sycl::backend backend1 = device1.get_backend();
             sycl::backend backend2 = device2.get_backend();
             // levelzero backends always come first
-            if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
-            if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
+            if (backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
+            if (backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
             dpct::device_info prop1;
             dpct::get_device_info(prop1, device1);
             dpct::device_info prop2;
             dpct::get_device_info(prop2, device2);
             return prop1.get_max_compute_units() > prop2.get_max_compute_units();
         }
-        static int convert_backend_index(std::string & backend) {
+        static int convert_backend_index(std::string& backend) {
             if (backend == "ext_oneapi_level_zero:gpu") return 0;
             if (backend == "opencl:gpu") return 1;
             if (backend == "ext_oneapi_cuda:gpu") return 2;
@@ -956,7 +947,7 @@ namespace dpct
             printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
             GGML_ASSERT(false);
         }
-        static bool compare_backend(std::string &backend1, std::string &backend2) {
+        static bool compare_backend(std::string& backend1, std::string& backend2) {
             return convert_backend_index(backend1) < convert_backend_index(backend2);
         }
         dev_mgr()
@@ -980,26 +971,26 @@ namespace dpct
                 Platforms.pop_back();
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
-                for (const auto &device : devices) {
+                for (const auto& device : devices) {
                     backend_devices[backend_type].push_back(device);
                 }
             }
 
             std::vector<std::string> keys;
-            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+            for (auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
                 keys.push_back(it->first);
             }
             std::sort(keys.begin(), keys.end(), compare_backend);
 
-            for (auto &key : keys) {
+            for (auto& key : keys) {
                 std::vector<sycl::device> devs = backend_devices[key];
                 std::sort(devs.begin(), devs.end(), compare_dev);
-                for (const auto &dev : devs) {
+                for (const auto& dev : devs) {
                     sycl_all_devs.push_back(dev);
                 }
             }
 
-            for (auto &dev : sycl_all_devs)
+            for (auto& dev : sycl_all_devs)
             {
                 if (dev == default_device)
                 {
@@ -1029,7 +1020,7 @@ namespace dpct
         int _cpu_device = -1;
     };
 
-    static inline sycl::queue &get_default_queue()
+    static inline sycl::queue& get_default_queue()
     {
         return dev_mgr::instance().current_device().default_queue();
     }
@@ -1044,8 +1035,8 @@ namespace dpct
             end
         };
 
-        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-                                                              const void *ptr)
+        static pointer_access_attribute get_pointer_attribute(sycl::queue& q,
+            const void* ptr)
         {
             switch (sycl::get_pointer_type(ptr, q.get_context()))
             {
@@ -1063,19 +1054,19 @@ namespace dpct
         inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
         {
             static_assert((unsigned char)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
+                std::numeric_limits<unsigned char>::max() &&
+                "library_data_t size exceeds limit.");
             static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
             return (std::uint64_t)Val;
         }
 
         template <typename FirstT, typename... RestT>
         inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-                                                               RestT... RestVal)
+            RestT... RestVal)
         {
             static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
+                std::numeric_limits<unsigned char>::max() &&
+                "library_data_t size exceeds limit.");
             static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
             static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
             return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
@@ -1088,10 +1079,10 @@ namespace dpct
                 // Reserved address space, no real memory allocation happens here.
 #if defined(__linux__)
                 mapped_address_space =
-                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+                    (byte_t*)mmap(nullptr, mapped_region_size, PROT_NONE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 #elif defined(_WIN64)
-                mapped_address_space = (byte_t *)VirtualAlloc(
+                mapped_address_space = (byte_t*)VirtualAlloc(
                     NULL,               // NULL specified as the base address parameter
                     mapped_region_size, // Size of allocation
                     MEM_RESERVE,        // Allocate reserved pages
@@ -1108,7 +1099,7 @@ namespace dpct
             struct allocation
             {
                 buffer_t buffer;
-                byte_t *alloc_ptr;
+                byte_t* alloc_ptr;
                 size_t size;
             };
 
@@ -1123,13 +1114,13 @@ namespace dpct
 #endif
             };
 
-            mem_mgr(const mem_mgr &) = delete;
-            mem_mgr &operator=(const mem_mgr &) = delete;
-            mem_mgr(mem_mgr &&) = delete;
-            mem_mgr &operator=(mem_mgr &&) = delete;
+            mem_mgr(const mem_mgr&) = delete;
+            mem_mgr& operator=(const mem_mgr&) = delete;
+            mem_mgr(mem_mgr&&) = delete;
+            mem_mgr& operator=(mem_mgr&&) = delete;
 
             /// Allocate
-            void *mem_alloc(size_t size)
+            void* mem_alloc(size_t size)
             {
                 if (!size)
                     return nullptr;
@@ -1141,9 +1132,9 @@ namespace dpct
                 // Allocation
                 sycl::range<1> r(size);
                 buffer_t buf(r);
-                allocation A{buf, next_free, size};
+                allocation A{ buf, next_free, size };
                 // Map allocation to device pointer
-                void *result = next_free;
+                void* result = next_free;
                 m_map.emplace(next_free + size, A);
                 // Update pointer to the next free space.
                 next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
@@ -1152,7 +1143,7 @@ namespace dpct
             }
 
             /// Deallocate
-            void mem_free(const void *ptr)
+            void mem_free(const void* ptr)
             {
                 if (!ptr)
                     return;
@@ -1162,7 +1153,7 @@ namespace dpct
             }
 
             /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-            allocation translate_ptr(const void *ptr)
+            allocation translate_ptr(const void* ptr)
             {
                 std::lock_guard<std::mutex> lock(m_mutex);
                 auto it = get_map_iterator(ptr);
@@ -1170,40 +1161,40 @@ namespace dpct
             }
 
             /// Check if the pointer represents device pointer or not.
-            bool is_device_ptr(const void *ptr) const
+            bool is_device_ptr(const void* ptr) const
             {
                 std::lock_guard<std::mutex> lock(m_mutex);
                 return (mapped_address_space <= ptr) &&
-                       (ptr < mapped_address_space + mapped_region_size);
+                    (ptr < mapped_address_space + mapped_region_size);
             }
 
             /// Returns the instance of memory manager singleton.
-            static mem_mgr &instance()
+            static mem_mgr& instance()
             {
                 static mem_mgr m;
                 return m;
             }
 
         private:
-            std::map<byte_t *, allocation> m_map;
+            std::map<byte_t*, allocation> m_map;
             mutable std::mutex m_mutex;
-            byte_t *mapped_address_space;
-            byte_t *next_free;
+            byte_t* mapped_address_space;
+            byte_t* next_free;
             const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
             const size_t alignment = 256;
             /// This padding may be defined to some positive value to debug
             /// out of bound accesses.
             const size_t extra_padding = 0;
 
-            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+            std::map<byte_t*, allocation>::iterator get_map_iterator(const void* ptr)
             {
-                auto it = m_map.upper_bound((byte_t *)ptr);
+                auto it = m_map.upper_bound((byte_t*)ptr);
                 if (it == m_map.end())
                 {
                     // Not a virtual pointer.
                     throw std::runtime_error("can not get buffer from non-virtual pointer");
                 }
-                const allocation &alloc = it->second;
+                const allocation& alloc = it->second;
                 if (ptr < alloc.alloc_ptr)
                 {
                     // Out of bound.
@@ -1225,7 +1216,7 @@ namespace dpct
                 sycl::access::target::device;
             static constexpr sycl::access_mode mode =
                 (Memory == constant) ? sycl::access_mode::read
-                                     : sycl::access_mode::read_write;
+                : sycl::access_mode::read_write;
             static constexpr size_t type_size = sizeof(T);
             using element_t =
                 typename std::conditional<Memory == constant, const T, T>::type;
@@ -1234,17 +1225,17 @@ namespace dpct
             using accessor_t = typename std::conditional<
                 Memory == local, sycl::local_accessor<value_t, Dimension>,
                 sycl::accessor<T, Dimension, mode, target>>::type;
-            using pointer_t = T *;
+            using pointer_t = T*;
         };
 
-        static inline void *dpct_malloc(size_t size, sycl::queue &q)
+        static inline void* dpct_malloc(size_t size, sycl::queue& q)
         {
             return sycl::malloc_device(size, q.get_device(), q.get_context());
         }
 
 #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                                        sycl::queue &q)
+        static inline void* dpct_malloc(size_t& pitch, size_t x, size_t y, size_t z,
+            sycl::queue& q)
         {
             pitch = PITCH_DEFAULT_ALIGN(x);
             return dpct_malloc(pitch * y * z, q);
@@ -1260,8 +1251,8 @@ namespace dpct
          * @return An event representing the memset operation.
          */
         template <typename valueT>
-        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-                                              valueT value, size_t size)
+        static inline sycl::event dpct_memset(sycl::queue& q, void* dev_ptr,
+            valueT value, size_t size)
         {
             return q.fill(dev_ptr, value, size);
         }
@@ -1277,15 +1268,15 @@ namespace dpct
          */
         template <typename valueT>
         static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-                    sycl::range<3> size)
+            dpct_memset(sycl::queue& q, pitched_data data, valueT value,
+                sycl::range<3> size)
         {
             std::vector<sycl::event> event_list;
             size_t slice = data.get_pitch() * data.get_y();
-            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+            unsigned char* data_surface = (unsigned char*)data.get_data_ptr();
             for (size_t z = 0; z < size.get(2); ++z)
             {
-                unsigned char *data_ptr = data_surface;
+                unsigned char* data_ptr = data_surface;
                 for (size_t y = 0; y < size.get(1); ++y)
                 {
                     event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
@@ -1309,16 +1300,16 @@ namespace dpct
          */
         template <typename valueT>
         static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-                    size_t y)
+            dpct_memset(sycl::queue& q, void* ptr, size_t pitch, valueT val, size_t x,
+                size_t y)
         {
             return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                               sycl::range<3>(x, y, 1));
+                sycl::range<3>(x, y, 1));
         }
 
-        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-                                                        const void *from_ptr,
-                                                        memcpy_direction dir)
+        static memcpy_direction deduce_memcpy_direction(sycl::queue& q, void* to_ptr,
+            const void* from_ptr,
+            memcpy_direction dir)
         {
             switch (dir)
             {
@@ -1332,16 +1323,16 @@ namespace dpct
                 // table[to_attribute][from_attribute]
                 static const memcpy_direction
                     direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
-                                       {{memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_host,
-                                         memcpy_direction::host_to_host},
-                                        {memcpy_direction::host_to_device,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device},
-                                        {memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device}};
+                    [static_cast<unsigned>(pointer_access_attribute::end)] =
+                { {memcpy_direction::host_to_host,
+                  memcpy_direction::device_to_host,
+                  memcpy_direction::host_to_host},
+                 {memcpy_direction::host_to_device,
+                  memcpy_direction::device_to_device,
+                  memcpy_direction::device_to_device},
+                 {memcpy_direction::host_to_host,
+                  memcpy_direction::device_to_device,
+                  memcpy_direction::device_to_device} };
                 return direction_table[static_cast<unsigned>(get_pointer_attribute(
                     q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
             }
@@ -1351,9 +1342,9 @@ namespace dpct
         }
 
         static sycl::event
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                    memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
+            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event>& dep_events = {})
         {
             if (!size)
                 return sycl::event{};
@@ -1363,13 +1354,13 @@ namespace dpct
 
         // Get actual copy range and make sure it will not exceed range.
         static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                            size_t pitch)
+            size_t pitch)
         {
             return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
         }
 
         static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                        size_t pitch)
+            size_t pitch)
         {
             return slice * id.get(2) + pitch * id.get(1) + id.get(0);
         }
@@ -1377,51 +1368,51 @@ namespace dpct
         /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
         /// and \p from_range to another specified by \p to_ptr and \p to_range.
         static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    sycl::range<3> to_range, sycl::range<3> from_range,
-                    sycl::id<3> to_id, sycl::id<3> from_id,
-                    sycl::range<3> size, memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
+            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event>& dep_events = {})
         {
             // RAII for host pointer
             class host_buffer
             {
-                void *_buf;
+                void* _buf;
                 size_t _size;
-                sycl::queue &_q;
-                const std::vector<sycl::event> &_deps; // free operation depends
+                sycl::queue& _q;
+                const std::vector<sycl::event>& _deps; // free operation depends
 
             public:
-                host_buffer(size_t size, sycl::queue &q,
-                            const std::vector<sycl::event> &deps)
+                host_buffer(size_t size, sycl::queue& q,
+                    const std::vector<sycl::event>& deps)
                     : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-                void *get_ptr() const { return _buf; }
+                void* get_ptr() const { return _buf; }
                 size_t get_size() const { return _size; }
                 ~host_buffer()
                 {
                     if (_buf)
                     {
-                        _q.submit([&](sycl::handler &cgh)
-                                  {
-        cgh.depends_on(_deps);
-        cgh.host_task([buf = _buf] { std::free(buf); }); });
+                        _q.submit([&](sycl::handler& cgh)
+                            {
+                                cgh.depends_on(_deps);
+                                cgh.host_task([buf = _buf] { std::free(buf); }); });
                     }
                 }
             };
             std::vector<sycl::event> event_list;
 
             size_t to_slice = to_range.get(1) * to_range.get(0),
-                   from_slice = from_range.get(1) * from_range.get(0);
-            unsigned char *to_surface =
-                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-            const unsigned char *from_surface =
-                (const unsigned char *)from_ptr +
+                from_slice = from_range.get(1) * from_range.get(0);
+            unsigned char* to_surface =
+                (unsigned char*)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+            const unsigned char* from_surface =
+                (const unsigned char*)from_ptr +
                 get_offset(from_id, from_slice, from_range.get(0));
 
             if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
             {
-                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                    direction, dep_events)};
+                return { dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                    direction, dep_events) };
             }
             direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
             size_t size_slice = size.get(1) * size.get(0);
@@ -1430,20 +1421,20 @@ namespace dpct
             case host_to_host:
                 for (size_t z = 0; z < size.get(2); ++z)
                 {
-                    unsigned char *to_ptr = to_surface;
-                    const unsigned char *from_ptr = from_surface;
+                    unsigned char* to_ptr = to_surface;
+                    const unsigned char* from_ptr = from_surface;
                     if (to_range.get(0) == from_range.get(0) &&
                         to_range.get(0) == size.get(0))
                     {
                         event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                         direction, dep_events));
+                            direction, dep_events));
                     }
                     else
                     {
                         for (size_t y = 0; y < size.get(1); ++y)
                         {
                             event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                             direction, dep_events));
+                                direction, dep_events));
                             to_ptr += to_range.get(0);
                             from_ptr += from_range.get(0);
                         }
@@ -1455,15 +1446,15 @@ namespace dpct
             case host_to_device:
             {
                 host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                                event_list);
+                    event_list);
                 std::vector<sycl::event> host_events;
                 if (to_slice == size_slice)
                 {
                     // Copy host data to a temp host buffer with the shape of target.
                     host_events =
                         dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                    host_to_host, dep_events);
+                            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                            host_to_host, dep_events);
                 }
                 else
                 {
@@ -1474,39 +1465,39 @@ namespace dpct
                         // If has padding data, not sure whether it is useless. So fill temp
                         // buffer with it.
                         std::vector<sycl::event>{
-                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                        device_to_host, dep_events)});
+                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                            device_to_host, dep_events)});
                 }
                 // Copy from temp host buffer to device with only one submit.
                 event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                                 buf.get_size(), host_to_device,
-                                                 host_events));
+                    buf.get_size(), host_to_device,
+                    host_events));
                 break;
             }
             case device_to_host:
             {
                 host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                                event_list);
+                    event_list);
                 // Copy from host temp buffer to host target with reshaping.
                 event_list = dpct_memcpy(
                     q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
                     sycl::id<3>(0, 0, 0), size, host_to_host,
                     // Copy from device to temp host buffer with only one submit.
                     std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                         buf.get_size(),
-                                                         device_to_host, dep_events)});
+                        buf.get_size(),
+                        device_to_host, dep_events)});
                 break;
             }
             case device_to_device:
-                event_list.push_back(q.submit([&](sycl::handler &cgh){
-                cgh.depends_on(dep_events);
-                cgh.parallel_for<class dpct_memcpy_3d_detail>(
-                    size,
-                    [=](sycl::id<3> id) {
-                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                            from_surface[get_offset(id, from_slice, from_range.get(0))];
-                    }); }));
-                break;
+                event_list.push_back(q.submit([&](sycl::handler& cgh) {
+                    cgh.depends_on(dep_events);
+                    cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                        size,
+                        [=](sycl::id<3> id) {
+                            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                                from_surface[get_offset(id, from_slice, from_range.get(0))];
+                        }); }));
+                    break;
             default:
                 throw std::runtime_error("dpct_memcpy: invalid direction value");
             }
@@ -1515,26 +1506,26 @@ namespace dpct
 
         /// memcpy 2D/3D matrix specified by pitched_data.
         static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                    memcpy_direction direction = automatic)
+            dpct_memcpy(sycl::queue& q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
         {
             return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                               size, direction);
+                sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                size, direction);
         }
 
         /// memcpy 2D matrix with pitch.
         static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                    memcpy_direction direction = automatic)
+            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
         {
             return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                               sycl::range<3>(from_pitch, y, 1),
-                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                               sycl::range<3>(x, y, 1), direction);
+                sycl::range<3>(from_pitch, y, 1),
+                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                sycl::range<3>(x, y, 1), direction);
         }
 
         namespace deprecated
@@ -1554,9 +1545,9 @@ namespace dpct
                 using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
                 using const_void_pointer =
                     typename std::allocator_traits<Alloc>::const_void_pointer;
-                using reference = typename std::allocator_traits<Alloc>::value_type &;
+                using reference = typename std::allocator_traits<Alloc>::value_type&;
                 using const_reference =
-                    const typename std::allocator_traits<Alloc>::value_type &;
+                    const typename std::allocator_traits<Alloc>::value_type&;
                 using difference_type =
                     typename std::allocator_traits<Alloc>::difference_type;
                 using size_type = typename std::allocator_traits<Alloc>::size_type;
@@ -1577,8 +1568,8 @@ namespace dpct
 
                 usm_allocator() : _impl(dpct::get_default_queue()) {}
                 ~usm_allocator() {}
-                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
-                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+                usm_allocator(const usm_allocator& other) : _impl(other._impl) {}
+                usm_allocator(usm_allocator&& other) : _impl(std::move(other._impl)) {}
                 pointer address(reference r) { return &r; }
                 const_pointer address(const_reference r) { return &r; }
                 pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
@@ -1593,14 +1584,14 @@ namespace dpct
                 {
                     return std::allocator_traits<Alloc>::max_size(_impl);
                 }
-                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
-                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+                bool operator==(const usm_allocator& other) const { return _impl == other._impl; }
+                bool operator!=(const usm_allocator& other) const { return _impl != other._impl; }
             };
 
         } // namespace deprecated
 
-        inline void dpct_free(void *ptr,
-                              const sycl::queue &q)
+        inline void dpct_free(void* ptr,
+            const sycl::queue& q)
         {
             if (ptr)
             {
@@ -1609,29 +1600,29 @@ namespace dpct
         }
 
         template <typename T>
-        inline auto get_memory(const void *x)
+        inline auto get_memory(const void* x)
         {
-            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+            T* new_x = reinterpret_cast<T*>(const_cast<void*>(x));
             return new_x;
         }
 
         template <typename T>
-        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
+        inline typename DataType<T>::T2 get_value(const T* s, sycl::queue& q)
         {
             using Ty = typename DataType<T>::T2;
             Ty s_h;
             if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-                detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
-                    .wait();
+                detail::dpct_memcpy(q, (void*)&s_h, (const void*)s, sizeof(T), device_to_host)
+                .wait();
             else
-                s_h = *reinterpret_cast<const Ty *>(s);
+                s_h = *reinterpret_cast<const Ty*>(s);
             return s_h;
         }
 
     } // namespace detail
 
     template <typename T>
-    inline auto get_value(const T *s, sycl::queue &q)
+    inline auto get_value(const T* s, sycl::queue& q)
     {
         return detail::get_value(s, q);
     }
@@ -1639,13 +1630,13 @@ namespace dpct
     namespace detail
     {
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                              oneapi::mkl::transpose b_trans, int m, int n, int k,
-                              const void *alpha, const void *a, int lda, const void *b,
-                              int ldb, const void *beta, void *c, int ldc)
+        inline void gemm_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+            oneapi::mkl::transpose b_trans, int m, int n, int k,
+            const void* alpha, const void* a, int lda, const void* b,
+            int ldb, const void* beta, void* c, int ldc)
         {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
@@ -1682,11 +1673,11 @@ namespace dpct
         };
 
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
-                                    const void *alpha, const void **a, int lda,
-                                    const void **b, int ldb, const void *beta, void **c,
-                                    int ldc, int batch_size)
+        inline void gemm_batch_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+            oneapi::mkl::transpose b_trans, int m, int n, int k,
+            const void* alpha, const void** a, int lda,
+            const void** b, int ldb, const void* beta, void** c,
+            int ldc, int batch_size)
         {
             struct matrix_info_t
             {
@@ -1697,11 +1688,11 @@ namespace dpct
                 std::int64_t groupsize_info;
             };
 
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
 
-            matrix_info_t *matrix_info =
-                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+            matrix_info_t* matrix_info =
+                (matrix_info_t*)std::malloc(sizeof(matrix_info_t));
             matrix_info->transpose_info[0] = a_trans;
             matrix_info->transpose_info[1] = b_trans;
             matrix_info->value_info[0] = alpha_value;
@@ -1718,28 +1709,28 @@ namespace dpct
                 q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
                 matrix_info->size_info, matrix_info->size_info + 1,
                 matrix_info->size_info + 2, matrix_info->value_info,
-                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+                reinterpret_cast<const Ta**>(a), matrix_info->ld_info,
+                reinterpret_cast<const Tb**>(b), matrix_info->ld_info + 1,
+                matrix_info->value_info + 1, reinterpret_cast<Tc**>(c),
                 matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
 
-            q.submit([&](sycl::handler &cgh)
-                     {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); }); });
+            q.submit([&](sycl::handler& cgh)
+                {
+                    cgh.depends_on(e);
+                    cgh.host_task([=] { std::free(matrix_info); }); });
         }
 
         template <class Ta, class Tb, class Tc, class Ts>
         inline void
-        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                        oneapi::mkl::transpose b_trans, int m, int n,
-                        int k, const void *alpha, const void *a, int lda,
-                        long long int stride_a, const void *b, int ldb,
-                        long long int stride_b, const void *beta, void *c,
-                        int ldc, long long int stride_c, int batch_size)
-        {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            gemm_batch_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+                oneapi::mkl::transpose b_trans, int m, int n,
+                int k, const void* alpha, const void* a, int lda,
+                long long int stride_a, const void* b, int ldb,
+                long long int stride_b, const void* beta, void* c,
+                int ldc, long long int stride_c, int batch_size)
+        {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
@@ -1753,9 +1744,9 @@ namespace dpct
 
     template <typename VecT, class BinaryOperation>
     inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                      const BinaryOperation binary_op)
+        const BinaryOperation binary_op)
     {
-        sycl::vec<unsigned, 1> v0{a}, v1{b};
+        sycl::vec<unsigned, 1> v0{ a }, v1{ b };
         auto v2 = v0.as<VecT>();
         auto v3 = v1.as<VecT>();
         auto v4 =
@@ -1764,9 +1755,9 @@ namespace dpct
         return v0;
     }
 
-    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                                  memcpy_direction direction = automatic,
-                                  sycl::queue &q = dpct::get_default_queue())
+    static void async_dpct_memcpy(void* to_ptr, const void* from_ptr, size_t size,
+        memcpy_direction direction = automatic,
+        sycl::queue& q = dpct::get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
     }
@@ -1779,16 +1770,16 @@ namespace dpct
 
     template <typename T>
     T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                               unsigned int logical_sub_group_size = 32)
+        unsigned int logical_sub_group_size = 32)
     {
         unsigned int id = g.get_local_linear_id();
         unsigned int start_index =
             id / logical_sub_group_size * logical_sub_group_size;
         unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
         return sycl::select_from_group(g, x,
-                                       target_offset < logical_sub_group_size
-                                           ? start_index + target_offset
-                                           : id);
+            target_offset < logical_sub_group_size
+            ? start_index + target_offset
+            : id);
     }
 
     template <typename T>
@@ -1796,14 +1787,14 @@ namespace dpct
     {
         return sycl::vec<T, 1>(val)
             .template as<sycl::vec<
-                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+            std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
             .template convert<T>();
     }
 
     template <typename T1, typename T2>
     using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                           uint32_t, int32_t>;
+        std::conditional_t<std::is_unsigned_v<T1>&& std::is_unsigned_v<T2>,
+        uint32_t, int32_t>;
 
     template <typename T1, typename T2, typename T3>
     inline auto dp4a(T1 a, T2 b, T3 c)
@@ -1830,7 +1821,7 @@ namespace dpct
     template <typename S, typename T>
     inline T vectorized_min(T a, T b)
     {
-        sycl::vec<T, 1> v0{a}, v1{b};
+        sycl::vec<T, 1> v0{ a }, v1{ b };
         auto v2 = v0.template as<S>();
         auto v3 = v1.template as<S>();
         auto v4 = sycl::min(v2, v3);
@@ -1844,13 +1835,13 @@ namespace dpct
     inline double pow(const double a, const double b) { return sycl::pow(a, b); }
     template <typename T, typename U>
     inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-    pow(const T a, const U b)
+        pow(const T a, const U b)
     {
         return sycl::pow(a, static_cast<T>(b));
     }
     template <typename T, typename U>
     inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-    pow(const T a, const U b)
+        pow(const T a, const U b)
     {
         return sycl::pow(static_cast<double>(a), static_cast<double>(b));
     }
@@ -1977,10 +1968,10 @@ namespace dpct
     }
 
     inline void
-    has_capability_or_fail(const sycl::device &dev,
-                           const std::initializer_list<sycl::aspect> &props)
+        has_capability_or_fail(const sycl::device& dev,
+            const std::initializer_list<sycl::aspect>& props)
     {
-        for (const auto &it : props)
+        for (const auto& it : props)
         {
             if (dev.has(it))
                 continue;
@@ -1988,13 +1979,13 @@ namespace dpct
             {
             case sycl::aspect::fp64:
                 throw std::runtime_error("'double' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
+                    dev.get_info<sycl::info::device::name>() +
+                    "' device");
                 break;
             case sycl::aspect::fp16:
                 throw std::runtime_error("'half' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
+                    dev.get_info<sycl::info::device::name>() +
+                    "' device");
                 break;
             default:
 #define __SYCL_ASPECT(ASPECT, ID) \
@@ -2003,15 +1994,15 @@ namespace dpct
 #define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
 #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
                 auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
-                {
-                    switch (AspectNum)
                     {
+                        switch (AspectNum)
+                        {
 #include <sycl/info/aspects.def>
 #include <sycl/info/aspects_deprecated.def>
-                    default:
-                        return "unknown aspect";
-                    }
-                };
+                        default:
+                            return "unknown aspect";
+                        }
+                    };
 #undef __SYCL_ASPECT_DEPRECATED_ALIAS
 #undef __SYCL_ASPECT_DEPRECATED
 #undef __SYCL_ASPECT
@@ -2028,20 +2019,20 @@ namespace dpct
         return dev_mgr::instance().current_device_id();
     }
 
-    static inline device_ext &get_current_device()
+    static inline device_ext& get_current_device()
     {
         return dev_mgr::instance().current_device();
     }
 
-    static inline sycl::queue &get_in_order_queue()
+    static inline sycl::queue& get_in_order_queue()
     {
         return dev_mgr::instance().current_device().in_order_queue();
     }
 
     static sycl::event
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
+        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr, size_t size,
+            memcpy_direction direction,
+            const std::vector<sycl::event>& dep_events = {})
     {
         if (!size)
             return sycl::event{};
@@ -2051,13 +2042,13 @@ namespace dpct
 
     // Get actual copy range and make sure it will not exceed range.
     static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                        size_t pitch)
+        size_t pitch)
     {
         return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
     }
 
     static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                    size_t pitch)
+        size_t pitch)
     {
         return slice * id.get(2) + pitch * id.get(1) + id.get(0);
     }
@@ -2065,51 +2056,51 @@ namespace dpct
     /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
     /// and \p from_range to another specified by \p to_ptr and \p to_range.
     static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                sycl::range<3> to_range, sycl::range<3> from_range,
-                sycl::id<3> to_id, sycl::id<3> from_id,
-                sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
+        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+            sycl::range<3> to_range, sycl::range<3> from_range,
+            sycl::id<3> to_id, sycl::id<3> from_id,
+            sycl::range<3> size, memcpy_direction direction,
+            const std::vector<sycl::event>& dep_events = {})
     {
         // RAII for host pointer
         class host_buffer
         {
-            void *_buf;
+            void* _buf;
             size_t _size;
-            sycl::queue &_q;
-            const std::vector<sycl::event> &_deps; // free operation depends
+            sycl::queue& _q;
+            const std::vector<sycl::event>& _deps; // free operation depends
 
         public:
-            host_buffer(size_t size, sycl::queue &q,
-                        const std::vector<sycl::event> &deps)
+            host_buffer(size_t size, sycl::queue& q,
+                const std::vector<sycl::event>& deps)
                 : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-            void *get_ptr() const { return _buf; }
+            void* get_ptr() const { return _buf; }
             size_t get_size() const { return _size; }
             ~host_buffer()
             {
                 if (_buf)
                 {
-                    _q.submit([&](sycl::handler &cgh)
-                              {
-            cgh.depends_on(_deps);
-            cgh.host_task([buf = _buf] { std::free(buf); }); });
+                    _q.submit([&](sycl::handler& cgh)
+                        {
+                            cgh.depends_on(_deps);
+                            cgh.host_task([buf = _buf] { std::free(buf); }); });
                 }
             }
         };
         std::vector<sycl::event> event_list;
 
         size_t to_slice = to_range.get(1) * to_range.get(0),
-               from_slice = from_range.get(1) * from_range.get(0);
-        unsigned char *to_surface =
-            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-        const unsigned char *from_surface =
-            (const unsigned char *)from_ptr +
+            from_slice = from_range.get(1) * from_range.get(0);
+        unsigned char* to_surface =
+            (unsigned char*)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+        const unsigned char* from_surface =
+            (const unsigned char*)from_ptr +
             get_offset(from_id, from_slice, from_range.get(0));
 
         if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
         {
-            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                direction, dep_events)};
+            return { dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                direction, dep_events) };
         }
         direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
         size_t size_slice = size.get(1) * size.get(0);
@@ -2118,20 +2109,20 @@ namespace dpct
         case host_to_host:
             for (size_t z = 0; z < size.get(2); ++z)
             {
-                unsigned char *to_ptr = to_surface;
-                const unsigned char *from_ptr = from_surface;
+                unsigned char* to_ptr = to_surface;
+                const unsigned char* from_ptr = from_surface;
                 if (to_range.get(0) == from_range.get(0) &&
                     to_range.get(0) == size.get(0))
                 {
                     event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                     direction, dep_events));
+                        direction, dep_events));
                 }
                 else
                 {
                     for (size_t y = 0; y < size.get(1); ++y)
                     {
                         event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                         direction, dep_events));
+                            direction, dep_events));
                         to_ptr += to_range.get(0);
                         from_ptr += from_range.get(0);
                     }
@@ -2143,15 +2134,15 @@ namespace dpct
         case host_to_device:
         {
             host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                            event_list);
+                event_list);
             std::vector<sycl::event> host_events;
             if (to_slice == size_slice)
             {
                 // Copy host data to a temp host buffer with the shape of target.
                 host_events =
                     dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                host_to_host, dep_events);
+                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                        host_to_host, dep_events);
             }
             else
             {
@@ -2162,40 +2153,40 @@ namespace dpct
                     // If has padding data, not sure whether it is useless. So fill temp
                     // buffer with it.
                     std::vector<sycl::event>{
-                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                    device_to_host, dep_events)});
+                    dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                        device_to_host, dep_events)});
             }
             // Copy from temp host buffer to device with only one submit.
             event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                             buf.get_size(), host_to_device,
-                                             host_events));
+                buf.get_size(), host_to_device,
+                host_events));
             break;
         }
         case device_to_host:
         {
             host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                            event_list);
+                event_list);
             // Copy from host temp buffer to host target with reshaping.
             event_list = dpct_memcpy(
                 q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
                 sycl::id<3>(0, 0, 0), size, host_to_host,
                 // Copy from device to temp host buffer with only one submit.
                 std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                     buf.get_size(),
-                                                     device_to_host, dep_events)});
+                    buf.get_size(),
+                    device_to_host, dep_events)});
             break;
         }
         case device_to_device:
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-        cgh.depends_on(dep_events);
-        cgh.parallel_for<class dpct_memcpy_3d_detail>(
-            size,
-            [=](sycl::id<3> id) {
-                to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                    from_surface[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-        break;
+            event_list.push_back(q.submit([&](sycl::handler& cgh)
+                {
+                    cgh.depends_on(dep_events);
+                    cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                        size,
+                        [=](sycl::id<3> id) {
+                            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                                from_surface[get_offset(id, from_slice, from_range.get(0))];
+                        }); }));
+                    break;
         default:
             throw std::runtime_error("dpct_memcpy: invalid direction value");
         }
@@ -2204,34 +2195,34 @@ namespace dpct
 
     /// memcpy 2D/3D matrix specified by pitched_data.
     static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                memcpy_direction direction = automatic)
+        dpct_memcpy(sycl::queue& q, pitched_data to, sycl::id<3> to_id,
+            pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+            memcpy_direction direction = automatic)
     {
         return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                           size, direction);
+            sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+            sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+            size, direction);
     }
 
     /// memcpy 2D matrix with pitch.
     static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                memcpy_direction direction = automatic)
+        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+            size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+            memcpy_direction direction = automatic)
     {
         return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                           sycl::range<3>(from_pitch, y, 1),
-                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                           sycl::range<3>(x, y, 1), direction);
+            sycl::range<3>(from_pitch, y, 1),
+            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+            sycl::range<3>(x, y, 1), direction);
     }
 
-    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                     oneapi::mkl::transpose b_trans, int m, int n, int k,
-                     const void *alpha, const void *a, library_data_t a_type,
-                     int lda, const void *b, library_data_t b_type, int ldb,
-                     const void *beta, void *c, library_data_t c_type, int ldc,
-                     library_data_t scaling_type)
+    inline void gemm(sycl::queue& q, oneapi::mkl::transpose a_trans,
+        oneapi::mkl::transpose b_trans, int m, int n, int k,
+        const void* alpha, const void* a, library_data_t a_type,
+        int lda, const void* b, library_data_t b_type, int ldb,
+        const void* beta, void* c, library_data_t c_type, int ldc,
+        library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2239,7 +2230,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
+            c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2248,114 +2239,114 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
+            case detail::get_type_combination_id(
+                library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
+            {
+                detail::gemm_impl<float, float, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_double, library_data_t::real_double,
             library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
+            {
+                detail::gemm_impl<double, double, double, double>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_float, library_data_t::complex_float,
             library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_impl<std::complex<float>, std::complex<float>,
-                              std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
+            {
+                detail::gemm_impl<std::complex<float>, std::complex<float>,
+                    std::complex<float>, std::complex<float>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_double, library_data_t::complex_double,
             library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_impl<std::complex<double>, std::complex<double>,
-                              std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_impl<std::complex<double>, std::complex<double>,
+                    std::complex<double>, std::complex<double>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                                          lda, b, ldb, beta, c, ldc);
-            break;
-        }
+            {
+                detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                        lda, b, ldb, beta, c, ldc);
+                break;
+            }
 #ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                                     ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                        ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_impl<sycl::half, sycl::half, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                                          a, lda, b, ldb, &beta_half, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                float alpha_value =
+                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                float beta_value =
+                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                sycl::half alpha_half(alpha_value);
+                sycl::half beta_half(beta_value);
+                detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                    sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                        a, lda, b, ldb, &beta_half, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            {
+                detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                              oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                    oneapi::mkl::bfloat16, float>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-            break;
-        }
+            {
+                float alpha_float =
+                    dpct::get_value(reinterpret_cast<const std::int32_t*>(alpha), q);
+                float beta_float =
+                    dpct::get_value(reinterpret_cast<const std::int32_t*>(beta), q);
+                detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                    q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+                break;
+            }
 #endif // __INTEL_MKL__
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
+            default:
+                throw std::runtime_error("the combination of data type is unsupported");
         }
     } // gemm()
 
@@ -2379,13 +2370,13 @@ namespace dpct
     /// \param [in] ldc Leading dimension of C.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a[],
-                           library_data_t a_type, int lda, const void *b[],
-                           library_data_t b_type, int ldb, const void *beta,
-                           void *c[], library_data_t c_type, int ldc,
-                           int batch_size, library_data_t scaling_type)
+    inline void gemm_batch(sycl::queue& q, oneapi::mkl::transpose a_trans,
+        oneapi::mkl::transpose b_trans, int m, int n, int k,
+        const void* alpha, const void* a[],
+        library_data_t a_type, int lda, const void* b[],
+        library_data_t b_type, int ldb, const void* beta,
+        void* c[], library_data_t c_type, int ldc,
+        int batch_size, library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2393,7 +2384,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
+            c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2402,124 +2393,124 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
+            case detail::get_type_combination_id(
+                library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
+            {
+                detail::gemm_batch_impl<float, float, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                    batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_double, library_data_t::real_double,
             library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
+            {
+                detail::gemm_batch_impl<double, double, double, double>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                    batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_float, library_data_t::complex_float,
             library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
+            {
+                detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                    std::complex<float>, std::complex<float>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_double, library_data_t::complex_double,
             library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                    std::complex<double>, std::complex<double>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, b, ldb, beta, c, ldc,
-                                                batch_size);
-            break;
-        }
+            {
+                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                        a, lda, b, ldb, beta, c, ldc,
+                        batch_size);
+                break;
+            }
 #ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            {
+                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                    oneapi::mkl::bfloat16, float>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           b, ldb, beta, c, ldc, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                        b, ldb, beta, c, ldc, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                           a, lda, b, ldb, &beta_float, c, ldc,
-                                           batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                float alpha_float =
+                    dpct::get_value(reinterpret_cast<const std::int32_t*>(alpha), q);
+                float beta_float =
+                    dpct::get_value(reinterpret_cast<const std::int32_t*>(beta), q);
+                detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                        a, lda, b, ldb, &beta_float, c, ldc,
+                        batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                    batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
+            {
+                detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                    batch_size);
+                break;
+            }
 #endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-                batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
+            {
+                float alpha_value =
+                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                float beta_value =
+                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                sycl::half alpha_half(alpha_value);
+                sycl::half beta_half(beta_value);
+                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                    q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+                    batch_size);
+                break;
+            }
+            default:
+                throw std::runtime_error("the combination of data type is unsupported");
         }
     }
 
@@ -2546,14 +2537,14 @@ namespace dpct
     /// \param [in] stride_c Stride between the different C matrices.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a, library_data_t a_type,
-                           int lda, long long int stride_a, const void *b,
-                           library_data_t b_type, int ldb, long long int stride_b,
-                           const void *beta, void *c, library_data_t c_type,
-                           int ldc, long long int stride_c, int batch_size,
-                           library_data_t scaling_type)
+    inline void gemm_batch(sycl::queue& q, oneapi::mkl::transpose a_trans,
+        oneapi::mkl::transpose b_trans, int m, int n, int k,
+        const void* alpha, const void* a, library_data_t a_type,
+        int lda, long long int stride_a, const void* b,
+        library_data_t b_type, int ldb, long long int stride_b,
+        const void* beta, void* c, library_data_t c_type,
+        int ldc, long long int stride_c, int batch_size,
+        library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2561,7 +2552,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
+            c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2570,138 +2561,138 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
+            case detail::get_type_combination_id(
+                library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
+            {
+                detail::gemm_batch_impl<float, float, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_double, library_data_t::real_double,
             library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
+            {
+                detail::gemm_batch_impl<double, double, double, double>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_float, library_data_t::complex_float,
             library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
+            {
+                detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                    std::complex<float>, std::complex<float>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                        beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::complex_double, library_data_t::complex_double,
             library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                    std::complex<double>, std::complex<double>>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                        beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, stride_a, b, ldb, stride_b,
-                                                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
+            {
+                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                        a, lda, stride_a, b, ldb, stride_b,
+                        beta, c, ldc, stride_c, batch_size);
+                break;
+            }
 #ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            {
+                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                    oneapi::mkl::bfloat16, float>(
+                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                        beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           stride_a, b, ldb, stride_b, beta, c, ldc,
-                                           stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                        stride_a, b, ldb, stride_b, beta, c, ldc,
+                        stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                                                  a, lda, stride_a, b, ldb, stride_b,
-                                                  beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
+            {
+                detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                        a, lda, stride_a, b, ldb, stride_b,
+                        beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            {
+                detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+                break;
+            }
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
+            {
+                detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+                break;
+            }
 #endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
+            case detail::get_type_combination_id(
+                library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-                &beta_half, c, ldc, stride_c, batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
+            {
+                float alpha_value =
+                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                float beta_value =
+                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                sycl::half alpha_half(alpha_value);
+                sycl::half beta_half(beta_value);
+                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                    q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+                    &beta_half, c, ldc, stride_c, batch_size);
+                break;
+            }
+            default:
+                throw std::runtime_error("the combination of data type is unsupported");
         }
     }
 
     static inline void
-    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                      size_t from_pitch, size_t x, size_t y,
-                      memcpy_direction direction = automatic,
-                      sycl::queue &q = get_default_queue())
+        async_dpct_memcpy(void* to_ptr, size_t to_pitch, const void* from_ptr,
+            size_t from_pitch, size_t x, size_t y,
+            memcpy_direction direction = automatic,
+            sycl::queue& q = get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-                            direction);
+            direction);
     }
 
     using err0 = detail::generic_error_type<struct err0_tag, int>;
     using err1 = detail::generic_error_type<struct err1_tag, int>;
 
-    static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) {
+    static inline void dpct_free(void* ptr, sycl::queue& q = get_default_queue()) {
         detail::dpct_free(ptr, q);
     }
 
@@ -2713,12 +2704,12 @@ namespace dpct
         using element_t = typename memory_t::element_t;
         using pointer_t = typename memory_t::pointer_t;
         using accessor_t = typename memory_t::template accessor_t<3>;
-        accessor(pointer_t data, const sycl::range<3> &in_range)
+        accessor(pointer_t data, const sycl::range<3>& in_range)
             : _data(data), _range(in_range) {}
         template <memory_region M = Memory>
-        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+        accessor(typename std::enable_if<M != local, const accessor_t>::type& acc)
             : accessor(acc, acc.get_range()) {}
-        accessor(const accessor_t &acc, const sycl::range<3> &in_range)
+        accessor(const accessor_t& acc, const sycl::range<3>& in_range)
             : accessor(acc.get_pointer(), in_range) {}
         accessor<T, Memory, 2> operator[](size_t index) const {
             sycl::range<2> sub(_range.get(1), _range.get(2));
@@ -2737,12 +2728,12 @@ namespace dpct
         using element_t = typename memory_t::element_t;
         using pointer_t = typename memory_t::pointer_t;
         using accessor_t = typename memory_t::template accessor_t<2>;
-        accessor(pointer_t data, const sycl::range<2> &in_range)
+        accessor(pointer_t data, const sycl::range<2>& in_range)
             : _data(data), _range(in_range) {}
         template <memory_region M = Memory>
-        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+        accessor(typename std::enable_if<M != local, const accessor_t>::type& acc)
             : accessor(acc, acc.get_range()) {}
-        accessor(const accessor_t &acc, const sycl::range<2> &in_range)
+        accessor(const accessor_t& acc, const sycl::range<2>& in_range)
             : accessor(acc.get_pointer(), in_range) {}
 
         pointer_t operator[](size_t index) const {
@@ -2762,18 +2753,18 @@ namespace dpct
         public:
             using accessor_t =
                 typename detail::memory_traits<Memory,
-                                            T>::template accessor_t<Dimension>;
+                T>::template accessor_t<Dimension>;
             using value_t = typename detail::memory_traits<Memory, T>::value_t;
             using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
 
             device_memory() : device_memory(sycl::range<Dimension>(1)) {}
 
             /// Constructor of 1-D array with initializer list
-            device_memory(const sycl::range<Dimension> &in_range,
-                        std::initializer_list<value_t> &&init_list)
+            device_memory(const sycl::range<Dimension>& in_range,
+                std::initializer_list<value_t>&& init_list)
                 : device_memory(in_range) {
                 assert(init_list.size() <= in_range.size());
-                _host_ptr = (value_t *)std::malloc(_size);
+                _host_ptr = (value_t*)std::malloc(_size);
                 std::memset(_host_ptr, 0, _size);
                 std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
             }
@@ -2781,23 +2772,23 @@ namespace dpct
             /// Constructor of 2-D array with initializer list
             template <size_t D = Dimension>
             device_memory(
-                const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
-                std::initializer_list<std::initializer_list<value_t>> &&init_list)
+                const typename std::enable_if<D == 2, sycl::range<2>>::type& in_range,
+                std::initializer_list<std::initializer_list<value_t>>&& init_list)
                 : device_memory(in_range) {
                 assert(init_list.size() <= in_range[0]);
-                _host_ptr = (value_t *)std::malloc(_size);
+                _host_ptr = (value_t*)std::malloc(_size);
                 std::memset(_host_ptr, 0, _size);
                 auto tmp_data = _host_ptr;
                 for (auto sub_list : init_list) {
                     assert(sub_list.size() <= in_range[1]);
                     std::memcpy(tmp_data, sub_list.begin(),
-                                sub_list.size() * sizeof(T));
+                        sub_list.size() * sizeof(T));
                     tmp_data += in_range[1];
                 }
             }
 
             /// Constructor with range
-            device_memory(const sycl::range<Dimension> &range_in)
+            device_memory(const sycl::range<Dimension>& range_in)
                 : _size(range_in.size() * sizeof(T)), _range(range_in),
                 _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) {
                 static_assert(
@@ -2826,7 +2817,7 @@ namespace dpct
             void init() { init(dpct::get_default_queue()); }
             /// Allocate memory with specified queue, and init memory if has initial
             /// value.
-            void init(sycl::queue &q) {
+            void init(sycl::queue& q) {
                 if (_device_ptr)
                     return;
                 if (!_size)
@@ -2834,21 +2825,21 @@ namespace dpct
                 allocate_device(q);
                 if (_host_ptr)
                     detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size,
-                                        host_to_device);
+                        host_to_device);
             }
 
             /// The variable is assigned to a device pointer.
-            void assign(value_t *src, size_t size) {
+            void assign(value_t* src, size_t size) {
                 this->~device_memory();
                 new (this) device_memory(src, size);
             }
 
             /// Get memory pointer of the memory object, which is virtual pointer when
             /// usm is not used, and device pointer when usm is used.
-            value_t *get_ptr() { return get_ptr(get_default_queue()); }
+            value_t* get_ptr() { return get_ptr(get_default_queue()); }
             /// Get memory pointer of the memory object, which is virtual pointer when
             /// usm is not used, and device pointer when usm is used.
-            value_t *get_ptr(sycl::queue &q) {
+            value_t* get_ptr(sycl::queue& q) {
                 init(q);
                 return _device_ptr;
             }
@@ -2857,7 +2848,7 @@ namespace dpct
             size_t get_size() { return _size; }
 
             template <size_t D = Dimension>
-            typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
+            typename std::enable_if<D == 1, T>::type& operator[](size_t index) {
                 init();
                 return _device_ptr[index];
             }
@@ -2866,39 +2857,39 @@ namespace dpct
             /// when usm is used and dimension is greater than 1.
             template <size_t D = Dimension>
             typename std::enable_if<D != 1, dpct_accessor_t>::type
-            get_access([[maybe_unused]] sycl::handler &cgh) {
-                return dpct_accessor_t((T *)_device_ptr, _range);
+                get_access([[maybe_unused]] sycl::handler& cgh) {
+                return dpct_accessor_t((T*)_device_ptr, _range);
             }
 
         private:
-            device_memory(value_t *memory_ptr, size_t size)
+            device_memory(value_t* memory_ptr, size_t size)
                 : _size(size), _range(size / sizeof(T)), _reference(true),
                 _device_ptr(memory_ptr) {}
 
-            void allocate_device(sycl::queue &q) {
-        #ifndef DPCT_USM_LEVEL_NONE
+            void allocate_device(sycl::queue& q) {
+#ifndef DPCT_USM_LEVEL_NONE
                 if (Memory == shared) {
-                    _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
-                                                                q.get_context());
+                    _device_ptr = (value_t*)sycl::malloc_shared(_size, q.get_device(),
+                        q.get_context());
                     return;
                 }
-        #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
+#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
                 if (Memory == constant) {
-                    _device_ptr = (value_t *)sycl::malloc_device(
+                    _device_ptr = (value_t*)sycl::malloc_device(
                         _size, q.get_device(), q.get_context(),
                         sycl::ext::oneapi::property::usm::device_read_only());
                     return;
                 }
-        #endif
-        #endif
-                _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
+#endif
+#endif
+                _device_ptr = (value_t*)detail::dpct_malloc(_size, q);
             }
 
             size_t _size;
             sycl::range<Dimension> _range;
             bool _reference;
-            value_t *_host_ptr;
-            value_t *_device_ptr;
+            value_t* _host_ptr;
+            value_t* _device_ptr;
         };
         template <class T, memory_region Memory>
         class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
@@ -2909,12 +2900,12 @@ namespace dpct
                 typename detail::memory_traits<Memory, T>::template accessor_t<0>;
 
             /// Constructor with initial value.
-            device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
+            device_memory(const value_t& val) : base(sycl::range<1>(1), { val }) {}
 
             /// Default constructor
             device_memory() : base(1) {}
         };
-        } // namespace detail
+    } // namespace detail
 
     template <class T, size_t Dimension>
     using global_memory = detail::device_memory<T, global, Dimension>;
@@ -2925,54 +2916,54 @@ namespace dpct
 
 
     template <typename T,
-            sycl::access::address_space addressSpace =
-                sycl::access::address_space::global_space,
-            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-            sycl::memory_scope memoryScope = sycl::memory_scope::device>
-    inline T atomic_fetch_add(T *addr, T operand) {
-    auto atm =
-        sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-    return atm.fetch_add(operand);
+        sycl::access::address_space addressSpace =
+        sycl::access::address_space::global_space,
+        sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+        sycl::memory_scope memoryScope = sycl::memory_scope::device>
+    inline T atomic_fetch_add(T* addr, T operand) {
+        auto atm =
+            sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+        return atm.fetch_add(operand);
     }
 
     template <sycl::access::address_space addressSpace =
-                sycl::access::address_space::global_space,
-            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-            sycl::memory_scope memoryScope = sycl::memory_scope::device,
-            typename T1, typename T2>
-    inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
-    auto atm =
-        sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-    return atm.fetch_add(operand);
+        sycl::access::address_space::global_space,
+        sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+        sycl::memory_scope memoryScope = sycl::memory_scope::device,
+        typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1* addr, T2 operand) {
+        auto atm =
+            sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+        return atm.fetch_add(operand);
     }
 
     template <typename T, sycl::access::address_space addressSpace =
-                            sycl::access::address_space::global_space>
-    inline T atomic_fetch_add(T *addr, T operand,
-                            sycl::memory_order memoryOrder) {
-    switch (memoryOrder) {
+        sycl::access::address_space::global_space>
+    inline T atomic_fetch_add(T* addr, T operand,
+        sycl::memory_order memoryOrder) {
+        switch (memoryOrder) {
         case sycl::memory_order::relaxed:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
-                                    sycl::memory_scope::device>(addr, operand);
+                sycl::memory_scope::device>(addr, operand);
         case sycl::memory_order::acq_rel:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
-                                    sycl::memory_scope::device>(addr, operand);
+                sycl::memory_scope::device>(addr, operand);
         case sycl::memory_order::seq_cst:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
-                                    sycl::memory_scope::device>(addr, operand);
+                sycl::memory_scope::device>(addr, operand);
         default:
             assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                            "atomics are: sycl::memory_order::relaxed, "
-                            "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+                "atomics are: sycl::memory_order::relaxed, "
+                "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
         }
     }
 
     template <sycl::access::address_space addressSpace =
-                sycl::access::address_space::global_space,
-            typename T1, typename T2>
-    inline T1 atomic_fetch_add(T1 *addr, T2 operand,
-                            sycl::memory_order memoryOrder) {
-    atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+        sycl::access::address_space::global_space,
+        typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1* addr, T2 operand,
+        sycl::memory_order memoryOrder) {
+        atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
     }
 
 } // COPY from DPCT head files
diff --git a/ggml.h b/ggml.h
index 13502a3622fc4..2e8fd0dbc2e31 100644
--- a/ggml.h
+++ b/ggml.h
@@ -312,6 +312,12 @@
     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

From 5de2122647d50b34f4719ced49ab382122746ea5 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 08:07:43 +0000
Subject: [PATCH 03/11] format

---
 CMakePresets.json         |   2 +-
 ggml-sycl/dpct/helper.hpp | 462 +++++++++++++++++++-------------------
 2 files changed, 232 insertions(+), 232 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 501b33073c8b8..fba22af9a6bab 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -57,7 +57,7 @@
     { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
     { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
-    
+
     { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
     { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp
index af484d8333e59..97ff5b39d819f 100644
--- a/ggml-sycl/dpct/helper.hpp
+++ b/ggml-sycl/dpct/helper.hpp
@@ -58,7 +58,7 @@
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
-inline std::string get_device_type_name(const sycl::device& Device) {
+inline std::string get_device_type_name(const sycl::device &Device) {
     auto DeviceType = Device.get_info<sycl::info::device::device_type>();
     switch (DeviceType) {
     case sycl::info::device_type::cpu:
@@ -74,7 +74,7 @@ inline std::string get_device_type_name(const sycl::device& Device) {
     }
 }
 
-inline std::string get_device_backend_and_type(const sycl::device& device) {
+inline std::string get_device_backend_and_type(const sycl::device &device) {
     std::stringstream device_type;
     sycl::backend backend = device.get_backend();
     device_type << backend << ":" << get_device_type_name(device);
@@ -83,22 +83,22 @@ inline std::string get_device_backend_and_type(const sycl::device& device) {
 
 namespace dpct
 {
-    typedef sycl::queue* queue_ptr;
-    typedef sycl::event* event_ptr;
-    typedef char* device_ptr;
+    typedef sycl::queue *queue_ptr;
+    typedef sycl::event *event_ptr;
+    typedef char *device_ptr;
     typedef uint8_t byte_t;
     typedef sycl::buffer<byte_t> buffer_t;
 
     /// SYCL default exception handler
     inline auto exception_handler = [](sycl::exception_list exceptions)
         {
-            for (std::exception_ptr const& e : exceptions)
+            for (std::exception_ptr const &e : exceptions)
             {
                 try
                 {
                     std::rethrow_exception(e);
                 }
-                catch (sycl::exception const& e)
+                catch (sycl::exception const &e)
                 {
                     std::cerr << "Caught asynchronous SYCL exception:" << std::endl
                         << e.what() << std::endl
@@ -196,7 +196,7 @@ namespace dpct
 
     namespace detail
     {
-        static void get_version(const sycl::device& dev, int& major, int& minor)
+        static void get_version(const sycl::device &dev, int &major, int &minor)
         {
             // Version string has the following format:
             // a. OpenCL<space><major.minor><space><vendor-specific-information>
@@ -246,11 +246,11 @@ namespace dpct
     {
     public:
         pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-        pitched_data(void* data, size_t pitch, size_t x, size_t y)
+        pitched_data(void *data, size_t pitch, size_t x, size_t y)
             : _data(data), _pitch(pitch), _x(x), _y(y) {}
 
-        void* get_data_ptr() { return _data; }
-        void set_data_ptr(void* data) { _data = data; }
+        void *get_data_ptr() { return _data; }
+        void set_data_ptr(void *data) { _data = data; }
 
         size_t get_pitch() { return _pitch; }
         void set_pitch(size_t pitch) { _pitch = pitch; }
@@ -262,7 +262,7 @@ namespace dpct
         void set_y(size_t y) { _y = y; }
 
     private:
-        void* _data;
+        void *_data;
         size_t _pitch, _x, _y;
     };
 
@@ -270,11 +270,11 @@ namespace dpct
     {
     public:
         // get interface
-        const char* get_name() const { return _name; }
-        char* get_name() { return _name; }
+        const char *get_name() const { return _name; }
+        char *get_name() { return _name; }
         template <typename WorkItemSizesTy = sycl::range<3>,
             std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-            std::is_same_v<WorkItemSizesTy, int*>,
+            std::is_same_v<WorkItemSizesTy, int *>,
             int> = 0>
         auto get_max_work_item_sizes() const
         {
@@ -289,7 +289,7 @@ namespace dpct
         }
         template <typename WorkItemSizesTy = sycl::range<3>,
             std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-            std::is_same_v<WorkItemSizesTy, int*>,
+            std::is_same_v<WorkItemSizesTy, int *>,
             int> = 0>
         auto get_max_work_item_sizes()
         {
@@ -318,24 +318,24 @@ namespace dpct
         {
             return _max_register_size_per_work_group;
         }
-        template <typename NDRangeSizeTy = size_t*,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t*> ||
-            std::is_same_v<NDRangeSizeTy, int*>,
+        template <typename NDRangeSizeTy = size_t *,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+            std::is_same_v<NDRangeSizeTy, int *>,
             int> = 0>
         auto get_max_nd_range_size() const
         {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t*>)
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
                 return _max_nd_range_size;
             else
                 return _max_nd_range_size_i;
         }
-        template <typename NDRangeSizeTy = size_t*,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t*> ||
-            std::is_same_v<NDRangeSizeTy, int*>,
+        template <typename NDRangeSizeTy = size_t *,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+            std::is_same_v<NDRangeSizeTy, int *>,
             int> = 0>
         auto get_max_nd_range_size()
         {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t*>)
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
                 return _max_nd_range_size;
             else
                 return _max_nd_range_size_i;
@@ -358,7 +358,7 @@ namespace dpct
         }
 
         // set interface
-        void set_name(const char* name)
+        void set_name(const char *name)
         {
             size_t length = strlen(name);
             if (length < 256)
@@ -482,21 +482,21 @@ namespace dpct
         std::array<unsigned char, 16> _uuid;
     };
 
-    static int get_major_version(const sycl::device& dev)
+    static int get_major_version(const sycl::device &dev)
     {
         int major, minor;
         detail::get_version(dev, major, minor);
         return major;
     }
 
-    static int get_minor_version(const sycl::device& dev)
+    static int get_minor_version(const sycl::device &dev)
     {
         int major, minor;
         detail::get_version(dev, major, minor);
         return minor;
     }
 
-    static void get_device_info(device_info& out, const sycl::device& dev)
+    static void get_device_info(device_info &out, const sycl::device &dev)
     {
         device_info prop;
         prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
@@ -567,7 +567,7 @@ namespace dpct
         std::vector<size_t> sub_group_sizes =
             dev.get_info<sycl::info::device::sub_group_sizes>();
 
-        for (const auto& sub_group_size : sub_group_sizes)
+        for (const auto &sub_group_size : sub_group_sizes)
         {
             if (max_sub_group_size < sub_group_size)
                 max_sub_group_size = sub_group_size;
@@ -601,7 +601,7 @@ namespace dpct
             std::lock_guard<mutex_type> lock(m_mutex);
             clear_queues();
         }
-        device_ext(const sycl::device& base) : sycl::device(base)
+        device_ext(const sycl::device &base) : sycl::device(base)
         {
             std::lock_guard<mutex_type> lock(m_mutex);
             init_queues();
@@ -664,10 +664,10 @@ namespace dpct
         /// Get the number of bytes of free and total memory on the SYCL device.
         /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
         /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-        void get_memory_info(size_t& free_memory, size_t& total_memory)
+        void get_memory_info(size_t &free_memory, size_t &total_memory)
         {
             total_memory = get_device_info().get_global_mem_size();
-            const char* warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
+            const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
                 "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
                 "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
@@ -691,7 +691,7 @@ namespace dpct
 #endif
         }
 
-        void get_device_info(device_info& out) const
+        void get_device_info(device_info &out) const
         {
             dpct::get_device_info(out, *this);
         }
@@ -710,11 +710,11 @@ namespace dpct
             init_queues();
         }
 
-        sycl::queue& in_order_queue() { return _q_in_order; }
+        sycl::queue &in_order_queue() { return _q_in_order; }
 
-        sycl::queue& out_of_order_queue() { return _q_out_of_order; }
+        sycl::queue &out_of_order_queue() { return _q_out_of_order; }
 
-        sycl::queue& default_queue()
+        sycl::queue &default_queue()
         {
             return in_order_queue();
         }
@@ -723,7 +723,7 @@ namespace dpct
         {
             std::unique_lock<mutex_type> lock(m_mutex);
             lock.unlock();
-            for (auto& q : _queues)
+            for (auto &q : _queues)
             {
                 q.wait_and_throw();
             }
@@ -829,7 +829,7 @@ namespace dpct
             return _queues.back();
         }
 
-        void get_version(int& major, int& minor) const
+        void get_version(int &major, int &minor) const
         {
             detail::get_version(*this, major, minor);
         }
@@ -843,13 +843,13 @@ namespace dpct
     class dev_mgr
     {
     public:
-        device_ext& current_device()
+        device_ext &current_device()
         {
             unsigned int dev_id = current_device_id();
             check_id(dev_id);
             return *_devs[dev_id];
         }
-        device_ext& cpu_device() const
+        device_ext &cpu_device() const
         {
             std::lock_guard<std::recursive_mutex> lock(m_mutex);
             if (_cpu_device == -1)
@@ -861,7 +861,7 @@ namespace dpct
                 return *_devs[_cpu_device];
             }
         }
-        device_ext& get_device(unsigned int id) const
+        device_ext &get_device(unsigned int id) const
         {
             std::lock_guard<std::recursive_mutex> lock(m_mutex);
             check_id(id);
@@ -887,7 +887,7 @@ namespace dpct
         }
         unsigned int device_count() { return _devs.size(); }
 
-        unsigned int get_device_id(const sycl::device& dev)
+        unsigned int get_device_id(const sycl::device &dev)
         {
             unsigned int id = 0;
             for (auto dev_item : _devs)
@@ -903,8 +903,8 @@ namespace dpct
 
         template <class DeviceSelector>
         std::enable_if_t<
-            std::is_invocable_r_v<int, DeviceSelector, const sycl::device&>>
-            select_device(const DeviceSelector& selector = sycl::gpu_selector_v)
+            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+            select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
         {
             sycl::device selected_device = sycl::device(selector);
             unsigned int selected_device_id = get_device_id(selected_device);
@@ -912,19 +912,19 @@ namespace dpct
         }
 
         /// Returns the instance of device manager singleton.
-        static dev_mgr& instance()
+        static dev_mgr &instance()
         {
             static dev_mgr d_m;
             return d_m;
         }
-        dev_mgr(const dev_mgr&) = delete;
-        dev_mgr& operator=(const dev_mgr&) = delete;
-        dev_mgr(dev_mgr&&) = delete;
-        dev_mgr& operator=(dev_mgr&&) = delete;
+        dev_mgr(const dev_mgr &) = delete;
+        dev_mgr &operator=(const dev_mgr &) = delete;
+        dev_mgr(dev_mgr &&) = delete;
+        dev_mgr &operator=(dev_mgr &&) = delete;
 
     private:
         mutable std::recursive_mutex m_mutex;
-        static bool compare_dev(sycl::device& device1, sycl::device& device2)
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
         {
             sycl::backend backend1 = device1.get_backend();
             sycl::backend backend2 = device2.get_backend();
@@ -937,7 +937,7 @@ namespace dpct
             dpct::get_device_info(prop2, device2);
             return prop1.get_max_compute_units() > prop2.get_max_compute_units();
         }
-        static int convert_backend_index(std::string& backend) {
+        static int convert_backend_index(std::string &backend) {
             if (backend == "ext_oneapi_level_zero:gpu") return 0;
             if (backend == "opencl:gpu") return 1;
             if (backend == "ext_oneapi_cuda:gpu") return 2;
@@ -947,7 +947,7 @@ namespace dpct
             printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
             GGML_ASSERT(false);
         }
-        static bool compare_backend(std::string& backend1, std::string& backend2) {
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
             return convert_backend_index(backend1) < convert_backend_index(backend2);
         }
         dev_mgr()
@@ -971,7 +971,7 @@ namespace dpct
                 Platforms.pop_back();
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
-                for (const auto& device : devices) {
+                for (const auto &device : devices) {
                     backend_devices[backend_type].push_back(device);
                 }
             }
@@ -982,15 +982,15 @@ namespace dpct
             }
             std::sort(keys.begin(), keys.end(), compare_backend);
 
-            for (auto& key : keys) {
+            for (auto &key : keys) {
                 std::vector<sycl::device> devs = backend_devices[key];
                 std::sort(devs.begin(), devs.end(), compare_dev);
-                for (const auto& dev : devs) {
+                for (const auto &dev : devs) {
                     sycl_all_devs.push_back(dev);
                 }
             }
 
-            for (auto& dev : sycl_all_devs)
+            for (auto &dev : sycl_all_devs)
             {
                 if (dev == default_device)
                 {
@@ -1020,7 +1020,7 @@ namespace dpct
         int _cpu_device = -1;
     };
 
-    static inline sycl::queue& get_default_queue()
+    static inline sycl::queue &get_default_queue()
     {
         return dev_mgr::instance().current_device().default_queue();
     }
@@ -1035,8 +1035,8 @@ namespace dpct
             end
         };
 
-        static pointer_access_attribute get_pointer_attribute(sycl::queue& q,
-            const void* ptr)
+        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+            const void *ptr)
         {
             switch (sycl::get_pointer_type(ptr, q.get_context()))
             {
@@ -1079,10 +1079,10 @@ namespace dpct
                 // Reserved address space, no real memory allocation happens here.
 #if defined(__linux__)
                 mapped_address_space =
-                    (byte_t*)mmap(nullptr, mapped_region_size, PROT_NONE,
+                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 #elif defined(_WIN64)
-                mapped_address_space = (byte_t*)VirtualAlloc(
+                mapped_address_space = (byte_t *)VirtualAlloc(
                     NULL,               // NULL specified as the base address parameter
                     mapped_region_size, // Size of allocation
                     MEM_RESERVE,        // Allocate reserved pages
@@ -1099,7 +1099,7 @@ namespace dpct
             struct allocation
             {
                 buffer_t buffer;
-                byte_t* alloc_ptr;
+                byte_t *alloc_ptr;
                 size_t size;
             };
 
@@ -1114,13 +1114,13 @@ namespace dpct
 #endif
             };
 
-            mem_mgr(const mem_mgr&) = delete;
-            mem_mgr& operator=(const mem_mgr&) = delete;
-            mem_mgr(mem_mgr&&) = delete;
-            mem_mgr& operator=(mem_mgr&&) = delete;
+            mem_mgr(const mem_mgr &) = delete;
+            mem_mgr &operator=(const mem_mgr &) = delete;
+            mem_mgr(mem_mgr &&) = delete;
+            mem_mgr &operator=(mem_mgr &&) = delete;
 
             /// Allocate
-            void* mem_alloc(size_t size)
+            void *mem_alloc(size_t size)
             {
                 if (!size)
                     return nullptr;
@@ -1134,7 +1134,7 @@ namespace dpct
                 buffer_t buf(r);
                 allocation A{ buf, next_free, size };
                 // Map allocation to device pointer
-                void* result = next_free;
+                void *result = next_free;
                 m_map.emplace(next_free + size, A);
                 // Update pointer to the next free space.
                 next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
@@ -1143,7 +1143,7 @@ namespace dpct
             }
 
             /// Deallocate
-            void mem_free(const void* ptr)
+            void mem_free(const void *ptr)
             {
                 if (!ptr)
                     return;
@@ -1153,7 +1153,7 @@ namespace dpct
             }
 
             /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-            allocation translate_ptr(const void* ptr)
+            allocation translate_ptr(const void *ptr)
             {
                 std::lock_guard<std::mutex> lock(m_mutex);
                 auto it = get_map_iterator(ptr);
@@ -1161,7 +1161,7 @@ namespace dpct
             }
 
             /// Check if the pointer represents device pointer or not.
-            bool is_device_ptr(const void* ptr) const
+            bool is_device_ptr(const void *ptr) const
             {
                 std::lock_guard<std::mutex> lock(m_mutex);
                 return (mapped_address_space <= ptr) &&
@@ -1169,32 +1169,32 @@ namespace dpct
             }
 
             /// Returns the instance of memory manager singleton.
-            static mem_mgr& instance()
+            static mem_mgr &instance()
             {
                 static mem_mgr m;
                 return m;
             }
 
         private:
-            std::map<byte_t*, allocation> m_map;
+            std::map<byte_t *, allocation> m_map;
             mutable std::mutex m_mutex;
-            byte_t* mapped_address_space;
-            byte_t* next_free;
+            byte_t *mapped_address_space;
+            byte_t *next_free;
             const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
             const size_t alignment = 256;
             /// This padding may be defined to some positive value to debug
             /// out of bound accesses.
             const size_t extra_padding = 0;
 
-            std::map<byte_t*, allocation>::iterator get_map_iterator(const void* ptr)
+            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
             {
-                auto it = m_map.upper_bound((byte_t*)ptr);
+                auto it = m_map.upper_bound((byte_t *)ptr);
                 if (it == m_map.end())
                 {
                     // Not a virtual pointer.
                     throw std::runtime_error("can not get buffer from non-virtual pointer");
                 }
-                const allocation& alloc = it->second;
+                const allocation &alloc = it->second;
                 if (ptr < alloc.alloc_ptr)
                 {
                     // Out of bound.
@@ -1225,17 +1225,17 @@ namespace dpct
             using accessor_t = typename std::conditional<
                 Memory == local, sycl::local_accessor<value_t, Dimension>,
                 sycl::accessor<T, Dimension, mode, target>>::type;
-            using pointer_t = T*;
+            using pointer_t = T *;
         };
 
-        static inline void* dpct_malloc(size_t size, sycl::queue& q)
+        static inline void *dpct_malloc(size_t size, sycl::queue &q)
         {
             return sycl::malloc_device(size, q.get_device(), q.get_context());
         }
 
 #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-        static inline void* dpct_malloc(size_t& pitch, size_t x, size_t y, size_t z,
-            sycl::queue& q)
+        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+            sycl::queue &q)
         {
             pitch = PITCH_DEFAULT_ALIGN(x);
             return dpct_malloc(pitch * y * z, q);
@@ -1251,7 +1251,7 @@ namespace dpct
          * @return An event representing the memset operation.
          */
         template <typename valueT>
-        static inline sycl::event dpct_memset(sycl::queue& q, void* dev_ptr,
+        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
             valueT value, size_t size)
         {
             return q.fill(dev_ptr, value, size);
@@ -1268,15 +1268,15 @@ namespace dpct
          */
         template <typename valueT>
         static inline std::vector<sycl::event>
-            dpct_memset(sycl::queue& q, pitched_data data, valueT value,
+            dpct_memset(sycl::queue &q, pitched_data data, valueT value,
                 sycl::range<3> size)
         {
             std::vector<sycl::event> event_list;
             size_t slice = data.get_pitch() * data.get_y();
-            unsigned char* data_surface = (unsigned char*)data.get_data_ptr();
+            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
             for (size_t z = 0; z < size.get(2); ++z)
             {
-                unsigned char* data_ptr = data_surface;
+                unsigned char *data_ptr = data_surface;
                 for (size_t y = 0; y < size.get(1); ++y)
                 {
                     event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
@@ -1300,15 +1300,15 @@ namespace dpct
          */
         template <typename valueT>
         static inline std::vector<sycl::event>
-            dpct_memset(sycl::queue& q, void* ptr, size_t pitch, valueT val, size_t x,
+            dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
                 size_t y)
         {
             return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
                 sycl::range<3>(x, y, 1));
         }
 
-        static memcpy_direction deduce_memcpy_direction(sycl::queue& q, void* to_ptr,
-            const void* from_ptr,
+        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+            const void *from_ptr,
             memcpy_direction dir)
         {
             switch (dir)
@@ -1342,9 +1342,9 @@ namespace dpct
         }
 
         static sycl::event
-            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr, size_t size,
+            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
                 memcpy_direction direction,
-                const std::vector<sycl::event>& dep_events = {})
+                const std::vector<sycl::event> &dep_events = {})
         {
             if (!size)
                 return sycl::event{};
@@ -1368,31 +1368,31 @@ namespace dpct
         /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
         /// and \p from_range to another specified by \p to_ptr and \p to_range.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
                 sycl::range<3> to_range, sycl::range<3> from_range,
                 sycl::id<3> to_id, sycl::id<3> from_id,
                 sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event>& dep_events = {})
+                const std::vector<sycl::event> &dep_events = {})
         {
             // RAII for host pointer
             class host_buffer
             {
-                void* _buf;
+                void *_buf;
                 size_t _size;
-                sycl::queue& _q;
-                const std::vector<sycl::event>& _deps; // free operation depends
+                sycl::queue &_q;
+                const std::vector<sycl::event> &_deps; // free operation depends
 
             public:
-                host_buffer(size_t size, sycl::queue& q,
-                    const std::vector<sycl::event>& deps)
+                host_buffer(size_t size, sycl::queue &q,
+                    const std::vector<sycl::event> &deps)
                     : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-                void* get_ptr() const { return _buf; }
+                void *get_ptr() const { return _buf; }
                 size_t get_size() const { return _size; }
                 ~host_buffer()
                 {
                     if (_buf)
                     {
-                        _q.submit([&](sycl::handler& cgh)
+                        _q.submit([&](sycl::handler &cgh)
                             {
                                 cgh.depends_on(_deps);
                                 cgh.host_task([buf = _buf] { std::free(buf); }); });
@@ -1403,10 +1403,10 @@ namespace dpct
 
             size_t to_slice = to_range.get(1) * to_range.get(0),
                 from_slice = from_range.get(1) * from_range.get(0);
-            unsigned char* to_surface =
-                (unsigned char*)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-            const unsigned char* from_surface =
-                (const unsigned char*)from_ptr +
+            unsigned char *to_surface =
+                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+            const unsigned char *from_surface =
+                (const unsigned char *)from_ptr +
                 get_offset(from_id, from_slice, from_range.get(0));
 
             if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
@@ -1421,8 +1421,8 @@ namespace dpct
             case host_to_host:
                 for (size_t z = 0; z < size.get(2); ++z)
                 {
-                    unsigned char* to_ptr = to_surface;
-                    const unsigned char* from_ptr = from_surface;
+                    unsigned char *to_ptr = to_surface;
+                    const unsigned char *from_ptr = from_surface;
                     if (to_range.get(0) == from_range.get(0) &&
                         to_range.get(0) == size.get(0))
                     {
@@ -1489,7 +1489,7 @@ namespace dpct
                 break;
             }
             case device_to_device:
-                event_list.push_back(q.submit([&](sycl::handler& cgh) {
+                event_list.push_back(q.submit([&](sycl::handler &cgh) {
                     cgh.depends_on(dep_events);
                     cgh.parallel_for<class dpct_memcpy_3d_detail>(
                         size,
@@ -1506,7 +1506,7 @@ namespace dpct
 
         /// memcpy 2D/3D matrix specified by pitched_data.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue& q, pitched_data to, sycl::id<3> to_id,
+            dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
                 pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
                 memcpy_direction direction = automatic)
         {
@@ -1518,7 +1518,7 @@ namespace dpct
 
         /// memcpy 2D matrix with pitch.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
                 size_t to_pitch, size_t from_pitch, size_t x, size_t y,
                 memcpy_direction direction = automatic)
         {
@@ -1545,9 +1545,9 @@ namespace dpct
                 using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
                 using const_void_pointer =
                     typename std::allocator_traits<Alloc>::const_void_pointer;
-                using reference = typename std::allocator_traits<Alloc>::value_type&;
+                using reference = typename std::allocator_traits<Alloc>::value_type &;
                 using const_reference =
-                    const typename std::allocator_traits<Alloc>::value_type&;
+                    const typename std::allocator_traits<Alloc>::value_type &;
                 using difference_type =
                     typename std::allocator_traits<Alloc>::difference_type;
                 using size_type = typename std::allocator_traits<Alloc>::size_type;
@@ -1568,8 +1568,8 @@ namespace dpct
 
                 usm_allocator() : _impl(dpct::get_default_queue()) {}
                 ~usm_allocator() {}
-                usm_allocator(const usm_allocator& other) : _impl(other._impl) {}
-                usm_allocator(usm_allocator&& other) : _impl(std::move(other._impl)) {}
+                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
                 pointer address(reference r) { return &r; }
                 const_pointer address(const_reference r) { return &r; }
                 pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
@@ -1584,14 +1584,14 @@ namespace dpct
                 {
                     return std::allocator_traits<Alloc>::max_size(_impl);
                 }
-                bool operator==(const usm_allocator& other) const { return _impl == other._impl; }
-                bool operator!=(const usm_allocator& other) const { return _impl != other._impl; }
+                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
             };
 
         } // namespace deprecated
 
-        inline void dpct_free(void* ptr,
-            const sycl::queue& q)
+        inline void dpct_free(void *ptr,
+            const sycl::queue &q)
         {
             if (ptr)
             {
@@ -1600,29 +1600,29 @@ namespace dpct
         }
 
         template <typename T>
-        inline auto get_memory(const void* x)
+        inline auto get_memory(const void *x)
         {
-            T* new_x = reinterpret_cast<T*>(const_cast<void*>(x));
+            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
             return new_x;
         }
 
         template <typename T>
-        inline typename DataType<T>::T2 get_value(const T* s, sycl::queue& q)
+        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
         {
             using Ty = typename DataType<T>::T2;
             Ty s_h;
             if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-                detail::dpct_memcpy(q, (void*)&s_h, (const void*)s, sizeof(T), device_to_host)
+                detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
                 .wait();
             else
-                s_h = *reinterpret_cast<const Ty*>(s);
+                s_h = *reinterpret_cast<const Ty *>(s);
             return s_h;
         }
 
     } // namespace detail
 
     template <typename T>
-    inline auto get_value(const T* s, sycl::queue& q)
+    inline auto get_value(const T *s, sycl::queue &q)
     {
         return detail::get_value(s, q);
     }
@@ -1630,13 +1630,13 @@ namespace dpct
     namespace detail
     {
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
             oneapi::mkl::transpose b_trans, int m, int n, int k,
-            const void* alpha, const void* a, int lda, const void* b,
-            int ldb, const void* beta, void* c, int ldc)
+            const void *alpha, const void *a, int lda, const void *b,
+            int ldb, const void *beta, void *c, int ldc)
         {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
@@ -1673,10 +1673,10 @@ namespace dpct
         };
 
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
             oneapi::mkl::transpose b_trans, int m, int n, int k,
-            const void* alpha, const void** a, int lda,
-            const void** b, int ldb, const void* beta, void** c,
+            const void *alpha, const void **a, int lda,
+            const void **b, int ldb, const void *beta, void **c,
             int ldc, int batch_size)
         {
             struct matrix_info_t
@@ -1688,11 +1688,11 @@ namespace dpct
                 std::int64_t groupsize_info;
             };
 
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
 
-            matrix_info_t* matrix_info =
-                (matrix_info_t*)std::malloc(sizeof(matrix_info_t));
+            matrix_info_t *matrix_info =
+                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
             matrix_info->transpose_info[0] = a_trans;
             matrix_info->transpose_info[1] = b_trans;
             matrix_info->value_info[0] = alpha_value;
@@ -1709,12 +1709,12 @@ namespace dpct
                 q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
                 matrix_info->size_info, matrix_info->size_info + 1,
                 matrix_info->size_info + 2, matrix_info->value_info,
-                reinterpret_cast<const Ta**>(a), matrix_info->ld_info,
-                reinterpret_cast<const Tb**>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc**>(c),
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
+                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
                 matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
 
-            q.submit([&](sycl::handler& cgh)
+            q.submit([&](sycl::handler &cgh)
                 {
                     cgh.depends_on(e);
                     cgh.host_task([=] { std::free(matrix_info); }); });
@@ -1722,15 +1722,15 @@ namespace dpct
 
         template <class Ta, class Tb, class Tc, class Ts>
         inline void
-            gemm_batch_impl(sycl::queue& q, oneapi::mkl::transpose a_trans,
+            gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
                 oneapi::mkl::transpose b_trans, int m, int n,
-                int k, const void* alpha, const void* a, int lda,
-                long long int stride_a, const void* b, int ldb,
-                long long int stride_b, const void* beta, void* c,
+                int k, const void *alpha, const void *a, int lda,
+                long long int stride_a, const void *b, int ldb,
+                long long int stride_b, const void *beta, void *c,
                 int ldc, long long int stride_c, int batch_size)
         {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts*>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts*>(beta), q);
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
@@ -1755,9 +1755,9 @@ namespace dpct
         return v0;
     }
 
-    static void async_dpct_memcpy(void* to_ptr, const void* from_ptr, size_t size,
+    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
         memcpy_direction direction = automatic,
-        sycl::queue& q = dpct::get_default_queue())
+        sycl::queue &q = dpct::get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
     }
@@ -1793,7 +1793,7 @@ namespace dpct
 
     template <typename T1, typename T2>
     using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1>&& std::is_unsigned_v<T2>,
+        std::conditional_t<std::is_unsigned_v<T1> &&std::is_unsigned_v<T2>,
         uint32_t, int32_t>;
 
     template <typename T1, typename T2, typename T3>
@@ -1968,10 +1968,10 @@ namespace dpct
     }
 
     inline void
-        has_capability_or_fail(const sycl::device& dev,
-            const std::initializer_list<sycl::aspect>& props)
+        has_capability_or_fail(const sycl::device &dev,
+            const std::initializer_list<sycl::aspect> &props)
     {
-        for (const auto& it : props)
+        for (const auto &it : props)
         {
             if (dev.has(it))
                 continue;
@@ -2019,20 +2019,20 @@ namespace dpct
         return dev_mgr::instance().current_device_id();
     }
 
-    static inline device_ext& get_current_device()
+    static inline device_ext &get_current_device()
     {
         return dev_mgr::instance().current_device();
     }
 
-    static inline sycl::queue& get_in_order_queue()
+    static inline sycl::queue &get_in_order_queue()
     {
         return dev_mgr::instance().current_device().in_order_queue();
     }
 
     static sycl::event
-        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr, size_t size,
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
             memcpy_direction direction,
-            const std::vector<sycl::event>& dep_events = {})
+            const std::vector<sycl::event> &dep_events = {})
     {
         if (!size)
             return sycl::event{};
@@ -2056,31 +2056,31 @@ namespace dpct
     /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
     /// and \p from_range to another specified by \p to_ptr and \p to_range.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
             sycl::range<3> to_range, sycl::range<3> from_range,
             sycl::id<3> to_id, sycl::id<3> from_id,
             sycl::range<3> size, memcpy_direction direction,
-            const std::vector<sycl::event>& dep_events = {})
+            const std::vector<sycl::event> &dep_events = {})
     {
         // RAII for host pointer
         class host_buffer
         {
-            void* _buf;
+            void *_buf;
             size_t _size;
-            sycl::queue& _q;
-            const std::vector<sycl::event>& _deps; // free operation depends
+            sycl::queue &_q;
+            const std::vector<sycl::event> &_deps; // free operation depends
 
         public:
-            host_buffer(size_t size, sycl::queue& q,
-                const std::vector<sycl::event>& deps)
+            host_buffer(size_t size, sycl::queue &q,
+                const std::vector<sycl::event> &deps)
                 : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-            void* get_ptr() const { return _buf; }
+            void *get_ptr() const { return _buf; }
             size_t get_size() const { return _size; }
             ~host_buffer()
             {
                 if (_buf)
                 {
-                    _q.submit([&](sycl::handler& cgh)
+                    _q.submit([&](sycl::handler &cgh)
                         {
                             cgh.depends_on(_deps);
                             cgh.host_task([buf = _buf] { std::free(buf); }); });
@@ -2091,10 +2091,10 @@ namespace dpct
 
         size_t to_slice = to_range.get(1) * to_range.get(0),
             from_slice = from_range.get(1) * from_range.get(0);
-        unsigned char* to_surface =
-            (unsigned char*)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-        const unsigned char* from_surface =
-            (const unsigned char*)from_ptr +
+        unsigned char *to_surface =
+            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+        const unsigned char *from_surface =
+            (const unsigned char *)from_ptr +
             get_offset(from_id, from_slice, from_range.get(0));
 
         if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
@@ -2109,8 +2109,8 @@ namespace dpct
         case host_to_host:
             for (size_t z = 0; z < size.get(2); ++z)
             {
-                unsigned char* to_ptr = to_surface;
-                const unsigned char* from_ptr = from_surface;
+                unsigned char *to_ptr = to_surface;
+                const unsigned char *from_ptr = from_surface;
                 if (to_range.get(0) == from_range.get(0) &&
                     to_range.get(0) == size.get(0))
                 {
@@ -2177,7 +2177,7 @@ namespace dpct
             break;
         }
         case device_to_device:
-            event_list.push_back(q.submit([&](sycl::handler& cgh)
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
                 {
                     cgh.depends_on(dep_events);
                     cgh.parallel_for<class dpct_memcpy_3d_detail>(
@@ -2195,7 +2195,7 @@ namespace dpct
 
     /// memcpy 2D/3D matrix specified by pitched_data.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue& q, pitched_data to, sycl::id<3> to_id,
+        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
             pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
             memcpy_direction direction = automatic)
     {
@@ -2207,7 +2207,7 @@ namespace dpct
 
     /// memcpy 2D matrix with pitch.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue& q, void* to_ptr, const void* from_ptr,
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
             size_t to_pitch, size_t from_pitch, size_t x, size_t y,
             memcpy_direction direction = automatic)
     {
@@ -2217,11 +2217,11 @@ namespace dpct
             sycl::range<3>(x, y, 1), direction);
     }
 
-    inline void gemm(sycl::queue& q, oneapi::mkl::transpose a_trans,
+    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
         oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void* alpha, const void* a, library_data_t a_type,
-        int lda, const void* b, library_data_t b_type, int ldb,
-        const void* beta, void* c, library_data_t c_type, int ldc,
+        const void *alpha, const void *a, library_data_t a_type,
+        int lda, const void *b, library_data_t b_type, int ldb,
+        const void *beta, void *c, library_data_t c_type, int ldc,
         library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
@@ -2305,9 +2305,9 @@ namespace dpct
             library_data_t::real_half, library_data_t::real_float):
             {
                 float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
                 float beta_value =
-                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
                 sycl::half alpha_half(alpha_value);
                 sycl::half beta_half(beta_value);
                 detail::gemm_impl<sycl::half, sycl::half, sycl::half,
@@ -2337,9 +2337,9 @@ namespace dpct
             library_data_t::real_int32, library_data_t::real_int32):
             {
                 float alpha_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t*>(alpha), q);
+                    dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
                 float beta_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t*>(beta), q);
+                    dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
                 detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
                     q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
                 break;
@@ -2370,12 +2370,12 @@ namespace dpct
     /// \param [in] ldc Leading dimension of C.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue& q, oneapi::mkl::transpose a_trans,
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
         oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void* alpha, const void* a[],
-        library_data_t a_type, int lda, const void* b[],
-        library_data_t b_type, int ldb, const void* beta,
-        void* c[], library_data_t c_type, int ldc,
+        const void *alpha, const void *a[],
+        library_data_t a_type, int lda, const void *b[],
+        library_data_t b_type, int ldb, const void *beta,
+        void *c[], library_data_t c_type, int ldc,
         int batch_size, library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
@@ -2466,9 +2466,9 @@ namespace dpct
             library_data_t::real_int32, library_data_t::real_int32):
             {
                 float alpha_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t*>(alpha), q);
+                    dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
                 float beta_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t*>(beta), q);
+                    dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
                 detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
                     float>(q, a_trans, b_trans, m, n, k, &alpha_float,
                         a, lda, b, ldb, &beta_float, c, ldc,
@@ -2499,9 +2499,9 @@ namespace dpct
             library_data_t::real_half, library_data_t::real_float):
             {
                 float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
                 float beta_value =
-                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
                 sycl::half alpha_half(alpha_value);
                 sycl::half beta_half(beta_value);
                 detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
@@ -2537,12 +2537,12 @@ namespace dpct
     /// \param [in] stride_c Stride between the different C matrices.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue& q, oneapi::mkl::transpose a_trans,
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
         oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void* alpha, const void* a, library_data_t a_type,
-        int lda, long long int stride_a, const void* b,
+        const void *alpha, const void *a, library_data_t a_type,
+        int lda, long long int stride_a, const void *b,
         library_data_t b_type, int ldb, long long int stride_b,
-        const void* beta, void* c, library_data_t c_type,
+        const void *beta, void *c, library_data_t c_type,
         int ldc, long long int stride_c, int batch_size,
         library_data_t scaling_type)
     {
@@ -2664,9 +2664,9 @@ namespace dpct
             library_data_t::real_half, library_data_t::real_float):
             {
                 float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float*>(alpha), q);
+                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
                 float beta_value =
-                    dpct::get_value(reinterpret_cast<const float*>(beta), q);
+                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
                 sycl::half alpha_half(alpha_value);
                 sycl::half beta_half(beta_value);
                 detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
@@ -2680,10 +2680,10 @@ namespace dpct
     }
 
     static inline void
-        async_dpct_memcpy(void* to_ptr, size_t to_pitch, const void* from_ptr,
+        async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
             size_t from_pitch, size_t x, size_t y,
             memcpy_direction direction = automatic,
-            sycl::queue& q = get_default_queue())
+            sycl::queue &q = get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
             direction);
@@ -2692,7 +2692,7 @@ namespace dpct
     using err0 = detail::generic_error_type<struct err0_tag, int>;
     using err1 = detail::generic_error_type<struct err1_tag, int>;
 
-    static inline void dpct_free(void* ptr, sycl::queue& q = get_default_queue()) {
+    static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) {
         detail::dpct_free(ptr, q);
     }
 
@@ -2704,12 +2704,12 @@ namespace dpct
         using element_t = typename memory_t::element_t;
         using pointer_t = typename memory_t::pointer_t;
         using accessor_t = typename memory_t::template accessor_t<3>;
-        accessor(pointer_t data, const sycl::range<3>& in_range)
+        accessor(pointer_t data, const sycl::range<3> &in_range)
             : _data(data), _range(in_range) {}
         template <memory_region M = Memory>
-        accessor(typename std::enable_if<M != local, const accessor_t>::type& acc)
+        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
             : accessor(acc, acc.get_range()) {}
-        accessor(const accessor_t& acc, const sycl::range<3>& in_range)
+        accessor(const accessor_t &acc, const sycl::range<3> &in_range)
             : accessor(acc.get_pointer(), in_range) {}
         accessor<T, Memory, 2> operator[](size_t index) const {
             sycl::range<2> sub(_range.get(1), _range.get(2));
@@ -2728,12 +2728,12 @@ namespace dpct
         using element_t = typename memory_t::element_t;
         using pointer_t = typename memory_t::pointer_t;
         using accessor_t = typename memory_t::template accessor_t<2>;
-        accessor(pointer_t data, const sycl::range<2>& in_range)
+        accessor(pointer_t data, const sycl::range<2> &in_range)
             : _data(data), _range(in_range) {}
         template <memory_region M = Memory>
-        accessor(typename std::enable_if<M != local, const accessor_t>::type& acc)
+        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
             : accessor(acc, acc.get_range()) {}
-        accessor(const accessor_t& acc, const sycl::range<2>& in_range)
+        accessor(const accessor_t &acc, const sycl::range<2> &in_range)
             : accessor(acc.get_pointer(), in_range) {}
 
         pointer_t operator[](size_t index) const {
@@ -2760,11 +2760,11 @@ namespace dpct
             device_memory() : device_memory(sycl::range<Dimension>(1)) {}
 
             /// Constructor of 1-D array with initializer list
-            device_memory(const sycl::range<Dimension>& in_range,
-                std::initializer_list<value_t>&& init_list)
+            device_memory(const sycl::range<Dimension> &in_range,
+                std::initializer_list<value_t> &&init_list)
                 : device_memory(in_range) {
                 assert(init_list.size() <= in_range.size());
-                _host_ptr = (value_t*)std::malloc(_size);
+                _host_ptr = (value_t *)std::malloc(_size);
                 std::memset(_host_ptr, 0, _size);
                 std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
             }
@@ -2772,11 +2772,11 @@ namespace dpct
             /// Constructor of 2-D array with initializer list
             template <size_t D = Dimension>
             device_memory(
-                const typename std::enable_if<D == 2, sycl::range<2>>::type& in_range,
-                std::initializer_list<std::initializer_list<value_t>>&& init_list)
+                const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
+                std::initializer_list<std::initializer_list<value_t>> &&init_list)
                 : device_memory(in_range) {
                 assert(init_list.size() <= in_range[0]);
-                _host_ptr = (value_t*)std::malloc(_size);
+                _host_ptr = (value_t *)std::malloc(_size);
                 std::memset(_host_ptr, 0, _size);
                 auto tmp_data = _host_ptr;
                 for (auto sub_list : init_list) {
@@ -2788,7 +2788,7 @@ namespace dpct
             }
 
             /// Constructor with range
-            device_memory(const sycl::range<Dimension>& range_in)
+            device_memory(const sycl::range<Dimension> &range_in)
                 : _size(range_in.size() * sizeof(T)), _range(range_in),
                 _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) {
                 static_assert(
@@ -2817,7 +2817,7 @@ namespace dpct
             void init() { init(dpct::get_default_queue()); }
             /// Allocate memory with specified queue, and init memory if has initial
             /// value.
-            void init(sycl::queue& q) {
+            void init(sycl::queue &q) {
                 if (_device_ptr)
                     return;
                 if (!_size)
@@ -2829,17 +2829,17 @@ namespace dpct
             }
 
             /// The variable is assigned to a device pointer.
-            void assign(value_t* src, size_t size) {
+            void assign(value_t *src, size_t size) {
                 this->~device_memory();
                 new (this) device_memory(src, size);
             }
 
             /// Get memory pointer of the memory object, which is virtual pointer when
             /// usm is not used, and device pointer when usm is used.
-            value_t* get_ptr() { return get_ptr(get_default_queue()); }
+            value_t *get_ptr() { return get_ptr(get_default_queue()); }
             /// Get memory pointer of the memory object, which is virtual pointer when
             /// usm is not used, and device pointer when usm is used.
-            value_t* get_ptr(sycl::queue& q) {
+            value_t *get_ptr(sycl::queue &q) {
                 init(q);
                 return _device_ptr;
             }
@@ -2848,7 +2848,7 @@ namespace dpct
             size_t get_size() { return _size; }
 
             template <size_t D = Dimension>
-            typename std::enable_if<D == 1, T>::type& operator[](size_t index) {
+            typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
                 init();
                 return _device_ptr[index];
             }
@@ -2857,39 +2857,39 @@ namespace dpct
             /// when usm is used and dimension is greater than 1.
             template <size_t D = Dimension>
             typename std::enable_if<D != 1, dpct_accessor_t>::type
-                get_access([[maybe_unused]] sycl::handler& cgh) {
-                return dpct_accessor_t((T*)_device_ptr, _range);
+                get_access([[maybe_unused]] sycl::handler &cgh) {
+                return dpct_accessor_t((T *)_device_ptr, _range);
             }
 
         private:
-            device_memory(value_t* memory_ptr, size_t size)
+            device_memory(value_t *memory_ptr, size_t size)
                 : _size(size), _range(size / sizeof(T)), _reference(true),
                 _device_ptr(memory_ptr) {}
 
-            void allocate_device(sycl::queue& q) {
+            void allocate_device(sycl::queue &q) {
 #ifndef DPCT_USM_LEVEL_NONE
                 if (Memory == shared) {
-                    _device_ptr = (value_t*)sycl::malloc_shared(_size, q.get_device(),
+                    _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
                         q.get_context());
                     return;
                 }
 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
                 if (Memory == constant) {
-                    _device_ptr = (value_t*)sycl::malloc_device(
+                    _device_ptr = (value_t *)sycl::malloc_device(
                         _size, q.get_device(), q.get_context(),
                         sycl::ext::oneapi::property::usm::device_read_only());
                     return;
                 }
 #endif
 #endif
-                _device_ptr = (value_t*)detail::dpct_malloc(_size, q);
+                _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
             }
 
             size_t _size;
             sycl::range<Dimension> _range;
             bool _reference;
-            value_t* _host_ptr;
-            value_t* _device_ptr;
+            value_t *_host_ptr;
+            value_t *_device_ptr;
         };
         template <class T, memory_region Memory>
         class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
@@ -2900,7 +2900,7 @@ namespace dpct
                 typename detail::memory_traits<Memory, T>::template accessor_t<0>;
 
             /// Constructor with initial value.
-            device_memory(const value_t& val) : base(sycl::range<1>(1), { val }) {}
+            device_memory(const value_t &val) : base(sycl::range<1>(1), { val }) {}
 
             /// Default constructor
             device_memory() : base(1) {}
@@ -2920,7 +2920,7 @@ namespace dpct
         sycl::access::address_space::global_space,
         sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
         sycl::memory_scope memoryScope = sycl::memory_scope::device>
-    inline T atomic_fetch_add(T* addr, T operand) {
+    inline T atomic_fetch_add(T *addr, T operand) {
         auto atm =
             sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
         return atm.fetch_add(operand);
@@ -2931,7 +2931,7 @@ namespace dpct
         sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
         sycl::memory_scope memoryScope = sycl::memory_scope::device,
         typename T1, typename T2>
-    inline T1 atomic_fetch_add(T1* addr, T2 operand) {
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
         auto atm =
             sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
         return atm.fetch_add(operand);
@@ -2939,7 +2939,7 @@ namespace dpct
 
     template <typename T, sycl::access::address_space addressSpace =
         sycl::access::address_space::global_space>
-    inline T atomic_fetch_add(T* addr, T operand,
+    inline T atomic_fetch_add(T *addr, T operand,
         sycl::memory_order memoryOrder) {
         switch (memoryOrder) {
         case sycl::memory_order::relaxed:
@@ -2961,7 +2961,7 @@ namespace dpct
     template <sycl::access::address_space addressSpace =
         sycl::access::address_space::global_space,
         typename T1, typename T2>
-    inline T1 atomic_fetch_add(T1* addr, T2 operand,
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand,
         sycl::memory_order memoryOrder) {
         atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
     }

From e1eabdc2e418b6e77ac83f54d3741389f222f90f Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 08:22:02 +0000
Subject: [PATCH 04/11] revert format change

---
 ggml-sycl/dpct/helper.hpp | 894 ++++++++++++++++++--------------------
 1 file changed, 429 insertions(+), 465 deletions(-)

diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp
index 97ff5b39d819f..6275993261792 100644
--- a/ggml-sycl/dpct/helper.hpp
+++ b/ggml-sycl/dpct/helper.hpp
@@ -77,7 +77,7 @@ inline std::string get_device_type_name(const sycl::device &Device) {
 inline std::string get_device_backend_and_type(const sycl::device &device) {
     std::stringstream device_type;
     sycl::backend backend = device.get_backend();
-    device_type << backend << ":" << get_device_type_name(device);
+    device_type <<  backend << ":" << get_device_type_name(device);
     return device_type.str();
 }
 
@@ -220,8 +220,7 @@ namespace dpct
                 // a. and b.
                 i++;
                 minor = std::stoi(&(ver[i]));
-            }
-            else {
+            } else {
                 // c.
                 minor = 0;
             }
@@ -232,7 +231,7 @@ namespace dpct
         {
         public:
             generic_error_type() = default;
-            generic_error_type(T value) : value{ value } {}
+            generic_error_type(T value) : value{value} {}
             operator T() const { return value; }
 
         private:
@@ -241,7 +240,7 @@ namespace dpct
 
     } // namespace detail
 
-    /// Pitched 2D/3D memory data.
+      /// Pitched 2D/3D memory data.
     class pitched_data
     {
     public:
@@ -577,7 +576,7 @@ namespace dpct
 
         prop.set_max_work_items_per_compute_unit(
             dev.get_info<sycl::info::device::max_work_group_size>());
-        int max_nd_range_size[] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
         prop.set_max_nd_range_size(max_nd_range_size);
 
         // Estimates max register size per work group, feel free to update the value
@@ -590,94 +589,75 @@ namespace dpct
     }
 
     /// dpct device extension
-    class device_ext : public sycl::device
-    {
+    class device_ext : public sycl::device {
         typedef std::mutex mutex_type;
 
     public:
         device_ext() : sycl::device() {}
-        ~device_ext()
-        {
+        ~device_ext() {
             std::lock_guard<mutex_type> lock(m_mutex);
             clear_queues();
         }
-        device_ext(const sycl::device &base) : sycl::device(base)
-        {
+        device_ext(const sycl::device &base) : sycl::device(base) {
             std::lock_guard<mutex_type> lock(m_mutex);
             init_queues();
         }
 
         int is_native_atomic_supported() { return 0; }
-        int get_major_version() const
-        {
-            return dpct::get_major_version(*this);
-        }
+        int get_major_version() const { return dpct::get_major_version(*this); }
 
-        int get_minor_version() const
-        {
-            return dpct::get_minor_version(*this);
-        }
+        int get_minor_version() const { return dpct::get_minor_version(*this); }
 
-        int get_max_compute_units() const
-        {
+        int get_max_compute_units() const {
             return get_device_info().get_max_compute_units();
         }
 
         /// Return the maximum clock frequency of this device in KHz.
-        int get_max_clock_frequency() const
-        {
+        int get_max_clock_frequency() const {
             return get_device_info().get_max_clock_frequency();
         }
 
         int get_integrated() const { return get_device_info().get_integrated(); }
 
-        int get_max_sub_group_size() const
-        {
+        int get_max_sub_group_size() const {
             return get_device_info().get_max_sub_group_size();
         }
 
-        int get_max_register_size_per_work_group() const
-        {
+        int get_max_register_size_per_work_group() const {
             return get_device_info().get_max_register_size_per_work_group();
         }
 
-        int get_max_work_group_size() const
-        {
+        int get_max_work_group_size() const {
             return get_device_info().get_max_work_group_size();
         }
 
-        int get_mem_base_addr_align() const
-        {
+        int get_mem_base_addr_align() const {
             return get_info<sycl::info::device::mem_base_addr_align>();
         }
 
-        size_t get_global_mem_size() const
-        {
+        size_t get_global_mem_size() const {
             return get_device_info().get_global_mem_size();
         }
 
-        size_t get_max_mem_alloc_size() const
-        {
+        size_t get_max_mem_alloc_size() const {
             return get_device_info().get_max_mem_alloc_size();
         }
 
         /// Get the number of bytes of free and total memory on the SYCL device.
-        /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
-        /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-        void get_memory_info(size_t &free_memory, size_t &total_memory)
-        {
+        /// \param [out] free_memory The number of bytes of free memory on the
+        /// SYCL device. \param [out] total_memory The number of bytes of total
+        /// memory on the SYCL device.
+        void get_memory_info(size_t &free_memory, size_t &total_memory) {
             total_memory = get_device_info().get_global_mem_size();
-            const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
+            const char *warning_info =
+                "get_memory_info: [warning] ext_intel_free_memory is not "
                 "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
                 "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-            if (!has(sycl::aspect::ext_intel_free_memory))
-            {
+            if (!has(sycl::aspect::ext_intel_free_memory)) {
                 std::cerr << warning_info << std::endl;
                 free_memory = total_memory;
-            }
-            else
-            {
+            } else {
                 free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
             }
 #else
@@ -691,20 +671,17 @@ namespace dpct
 #endif
         }
 
-        void get_device_info(device_info &out) const
-        {
+        void get_device_info(device_info &out) const {
             dpct::get_device_info(out, *this);
         }
 
-        device_info get_device_info() const
-        {
+        device_info get_device_info() const {
             device_info prop;
             dpct::get_device_info(prop, *this);
             return prop;
         }
 
-        void reset()
-        {
+        void reset() {
             std::lock_guard<mutex_type> lock(m_mutex);
             clear_queues();
             init_queues();
@@ -714,25 +691,20 @@ namespace dpct
 
         sycl::queue &out_of_order_queue() { return _q_out_of_order; }
 
-        sycl::queue &default_queue()
-        {
-            return in_order_queue();
-        }
+        sycl::queue &default_queue() { return in_order_queue(); }
 
-        void queues_wait_and_throw()
-        {
+        void queues_wait_and_throw() {
             std::unique_lock<mutex_type> lock(m_mutex);
             lock.unlock();
-            for (auto &q : _queues)
-            {
+            for (auto &q : _queues) {
                 q.wait_and_throw();
             }
-            // Guard the destruct of current_queues to make sure the ref count is safe.
+            // Guard the destruct of current_queues to make sure the ref count is
+            // safe.
             lock.lock();
         }
 
-        sycl::queue create_queue(bool enable_exception_handler = false)
-        {
+        sycl::queue create_queue(bool enable_exception_handler = false) {
             return create_in_order_queue(enable_exception_handler);
         }
 
@@ -754,52 +726,45 @@ namespace dpct
                 sycl::property::queue::in_order());
         }
 
-        sycl::queue create_out_of_order_queue(bool enable_exception_handler = false) {
+        sycl::queue create_out_of_order_queue(
+            bool enable_exception_handler = false) {
             std::lock_guard<mutex_type> lock(m_mutex);
             return create_queue_impl(enable_exception_handler);
         }
 
-        void destroy_queue(sycl::queue queue)
-        {
+        void destroy_queue(sycl::queue queue) {
             std::lock_guard<mutex_type> lock(m_mutex);
             _queues.clear();
         }
-        void set_saved_queue(sycl::queue q)
-        {
+        void set_saved_queue(sycl::queue q) {
             std::lock_guard<mutex_type> lock(m_mutex);
             _saved_queue = q;
         }
-        sycl::queue get_saved_queue() const
-        {
+        sycl::queue get_saved_queue() const {
             std::lock_guard<mutex_type> lock(m_mutex);
             return _saved_queue;
         }
 
     private:
-        void clear_queues()
-        {
-            _queues.clear();
-        }
+        void clear_queues() { _queues.clear(); }
 
-        void init_queues()
-        {
-            _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
+        void init_queues() {
+            _q_in_order =
+                create_queue_impl(true, sycl::property::queue::in_order());
             _q_out_of_order = create_queue_impl(true);
             _saved_queue = default_queue();
         }
 
-        /// Caller should acquire resource \p m_mutex before calling this function.
+        /// Caller should acquire resource \p m_mutex before calling this
+        /// function.
         template <class... Properties>
         sycl::queue create_queue_impl(bool enable_exception_handler,
-            Properties... properties)
-        {
+            Properties... properties) {
             sycl::async_handler eh = {};
-            if (enable_exception_handler)
-            {
+            if (enable_exception_handler) {
                 eh = exception_handler;
             }
-            auto q = sycl::queue(
-                *this, eh,
+            auto q = sycl::queue(*this, eh,
                 sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                     sycl::property::queue::enable_profiling(),
@@ -818,19 +783,18 @@ namespace dpct
             if (enable_exception_handler) {
                 eh = exception_handler;
             }
-            _queues.push_back(sycl::queue(
-                device, eh,
-                sycl::property_list(
+            _queues.push_back(
+                sycl::queue(device, eh,
+                    sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                    sycl::property::queue::enable_profiling(),
+                        sycl::property::queue::enable_profiling(),
 #endif
-                    properties...)));
+                        properties...)));
 
             return _queues.back();
         }
 
-        void get_version(int &major, int &minor) const
-        {
+        void get_version(int &major, int &minor) const {
             detail::get_version(*this, major, minor);
         }
         sycl::queue _q_in_order, _q_out_of_order;
@@ -929,15 +893,15 @@ namespace dpct
             sycl::backend backend1 = device1.get_backend();
             sycl::backend backend2 = device2.get_backend();
             // levelzero backends always come first
-            if (backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
-            if (backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
+            if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
+            if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
             dpct::device_info prop1;
             dpct::get_device_info(prop1, device1);
             dpct::device_info prop2;
             dpct::get_device_info(prop2, device2);
             return prop1.get_max_compute_units() > prop2.get_max_compute_units();
         }
-        static int convert_backend_index(std::string &backend) {
+        static int convert_backend_index(std::string & backend) {
             if (backend == "ext_oneapi_level_zero:gpu") return 0;
             if (backend == "opencl:gpu") return 1;
             if (backend == "ext_oneapi_cuda:gpu") return 2;
@@ -977,7 +941,7 @@ namespace dpct
             }
 
             std::vector<std::string> keys;
-            for (auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
                 keys.push_back(it->first);
             }
             std::sort(keys.begin(), keys.end(), compare_backend);
@@ -1132,7 +1096,7 @@ namespace dpct
                 // Allocation
                 sycl::range<1> r(size);
                 buffer_t buf(r);
-                allocation A{ buf, next_free, size };
+                allocation A{buf, next_free, size};
                 // Map allocation to device pointer
                 void *result = next_free;
                 m_map.emplace(next_free + size, A);
@@ -1242,14 +1206,14 @@ namespace dpct
         }
 
         /**
-         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] dev_ptr Pointer to the virtual device memory address.
-         * @param [in] value The value to be set.
-         * @param [in] size Number of elements to be set to the value.
-         * @return An event representing the memset operation.
-         */
+        * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+        * @tparam valueT The type of the element to be set.
+        * @param [in] q The queue in which the operation is done.
+        * @param [in] dev_ptr Pointer to the virtual device memory address.
+        * @param [in] value The value to be set.
+        * @param [in] size Number of elements to be set to the value.
+        * @return An event representing the memset operation.
+        */
         template <typename valueT>
         static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
             valueT value, size_t size)
@@ -1258,14 +1222,14 @@ namespace dpct
         }
 
         /**
-         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] data Pointer to the pitched device memory region.
-         * @param [in] value The value to be set.
-         * @param [in] size 3D memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
+        * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+        * @tparam valueT The type of the element to be set.
+        * @param [in] q The queue in which the operation is done.
+        * @param [in] data Pointer to the pitched device memory region.
+        * @param [in] value The value to be set.
+        * @param [in] size 3D memory region by number of elements.
+        * @return An event list representing the memset operations.
+        */
         template <typename valueT>
         static inline std::vector<sycl::event>
             dpct_memset(sycl::queue &q, pitched_data data, valueT value,
@@ -1288,16 +1252,16 @@ namespace dpct
         }
 
         /**
-         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] ptr Pointer to the virtual device memory.
-         * @param [in] pitch The pitch size by number of elements, including padding.
-         * @param [in] val The value to be set.
-         * @param [in] x The width of memory region by number of elements.
-         * @param [in] y The height of memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
+        * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+        * @tparam valueT The type of the element to be set.
+        * @param [in] q The queue in which the operation is done.
+        * @param [in] ptr Pointer to the virtual device memory.
+        * @param [in] pitch The pitch size by number of elements, including padding.
+        * @param [in] val The value to be set.
+        * @param [in] x The width of memory region by number of elements.
+        * @param [in] y The height of memory region by number of elements.
+        * @return An event list representing the memset operations.
+        */
         template <typename valueT>
         static inline std::vector<sycl::event>
             dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
@@ -1319,20 +1283,20 @@ namespace dpct
             case memcpy_direction::device_to_device:
                 return dir;
             case memcpy_direction::automatic:
-            {
+                {
                 // table[to_attribute][from_attribute]
                 static const memcpy_direction
                     direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
                     [static_cast<unsigned>(pointer_access_attribute::end)] =
-                { {memcpy_direction::host_to_host,
-                  memcpy_direction::device_to_host,
-                  memcpy_direction::host_to_host},
-                 {memcpy_direction::host_to_device,
-                  memcpy_direction::device_to_device,
-                  memcpy_direction::device_to_device},
-                 {memcpy_direction::host_to_host,
-                  memcpy_direction::device_to_device,
-                  memcpy_direction::device_to_device} };
+                {{memcpy_direction::host_to_host,
+                    memcpy_direction::device_to_host,
+                    memcpy_direction::host_to_host},
+                    {memcpy_direction::host_to_device,
+                    memcpy_direction::device_to_device,
+                    memcpy_direction::device_to_device},
+                    {memcpy_direction::host_to_host,
+                    memcpy_direction::device_to_device,
+                    memcpy_direction::device_to_device}};
                 return direction_table[static_cast<unsigned>(get_pointer_attribute(
                     q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
             }
@@ -1411,8 +1375,8 @@ namespace dpct
 
             if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
             {
-                return { dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                    direction, dep_events) };
+                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                    direction, dep_events)};
             }
             direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
             size_t size_slice = size.get(1) * size.get(0);
@@ -1444,7 +1408,7 @@ namespace dpct
                 }
                 break;
             case host_to_device:
-            {
+                {
                 host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
                     event_list);
                 std::vector<sycl::event> host_events;
@@ -1475,7 +1439,7 @@ namespace dpct
                 break;
             }
             case device_to_host:
-            {
+                {
                 host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
                     event_list);
                 // Copy from host temp buffer to host target with reshaping.
@@ -1489,7 +1453,7 @@ namespace dpct
                 break;
             }
             case device_to_device:
-                event_list.push_back(q.submit([&](sycl::handler &cgh) {
+                event_list.push_back(q.submit([&](sycl::handler &cgh){
                     cgh.depends_on(dep_events);
                     cgh.parallel_for<class dpct_memcpy_3d_detail>(
                         size,
@@ -1746,7 +1710,7 @@ namespace dpct
     inline unsigned vectorized_binary(unsigned a, unsigned b,
         const BinaryOperation binary_op)
     {
-        sycl::vec<unsigned, 1> v0{ a }, v1{ b };
+        sycl::vec<unsigned, 1> v0{a}, v1{b};
         auto v2 = v0.as<VecT>();
         auto v3 = v1.as<VecT>();
         auto v4 =
@@ -1793,7 +1757,7 @@ namespace dpct
 
     template <typename T1, typename T2>
     using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1> &&std::is_unsigned_v<T2>,
+        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
         uint32_t, int32_t>;
 
     template <typename T1, typename T2, typename T3>
@@ -1821,7 +1785,7 @@ namespace dpct
     template <typename S, typename T>
     inline T vectorized_min(T a, T b)
     {
-        sycl::vec<T, 1> v0{ a }, v1{ b };
+        sycl::vec<T, 1> v0{a}, v1{b};
         auto v2 = v0.template as<S>();
         auto v3 = v1.template as<S>();
         auto v4 = sycl::min(v2, v3);
@@ -2099,8 +2063,8 @@ namespace dpct
 
         if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
         {
-            return { dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                direction, dep_events) };
+            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                direction, dep_events)};
         }
         direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
         size_t size_slice = size.get(1) * size.get(0);
@@ -2132,7 +2096,7 @@ namespace dpct
             }
             break;
         case host_to_device:
-        {
+            {
             host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
                 event_list);
             std::vector<sycl::event> host_events;
@@ -2163,7 +2127,7 @@ namespace dpct
             break;
         }
         case device_to_host:
-        {
+            {
             host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
                 event_list);
             // Copy from host temp buffer to host target with reshaping.
@@ -2242,134 +2206,134 @@ namespace dpct
             case detail::get_type_combination_id(
                 library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-            {
+        {
                 detail::gemm_impl<float, float, float, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
                 break;
             }
-            case detail::get_type_combination_id(
-                library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-            {
-                detail::gemm_impl<double, double, double, double>(
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_impl<std::complex<float>, std::complex<float>,
+                std::complex<float>, std::complex<float>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-            {
-                detail::gemm_impl<std::complex<float>, std::complex<float>,
-                    std::complex<float>, std::complex<float>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-            {
-                detail::gemm_impl<std::complex<double>, std::complex<double>,
-                    std::complex<double>, std::complex<double>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-            {
-                detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                        lda, b, ldb, beta, c, ldc);
-                break;
-            }
-#ifdef __INTEL_MKL__
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                        ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_impl<sycl::half, sycl::half, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_impl<std::complex<double>, std::complex<double>,
+                std::complex<double>, std::complex<double>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-            {
-                float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-                float beta_value =
-                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
-                sycl::half alpha_half(alpha_value);
-                sycl::half beta_half(beta_value);
-                detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                    sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                        a, lda, b, ldb, &beta_half, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                    lda, b, ldb, beta, c, ldc);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                    ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                    a, lda, b, ldb, &beta_half, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                oneapi::mkl::bfloat16, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-            {
-                detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                    oneapi::mkl::bfloat16, float>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-            {
-                float alpha_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-                float beta_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-                detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-                    q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-                break;
-            }
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+            break;
+        }
 #endif // __INTEL_MKL__
-            default:
-                throw std::runtime_error("the combination of data type is unsupported");
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
         }
     } // gemm()
 
-    /// Computes a batch of matrix-matrix product with general matrices.
-    /// \param [in] q The queue where the routine should be executed.
-    /// \param [in] a_trans Specifies the operation applied to A.
-    /// \param [in] b_trans Specifies the operation applied to B.
-    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-    /// \param [in] alpha Scaling factor for the matrix-matrix product.
-    /// \param [in] a Input matrix A.
-    /// \param [in] a_type Data type of the matrix A.
-    /// \param [in] lda Leading dimension of A.
-    /// \param [in] b Input matrix B.
-    /// \param [in] b_type Data type of the matrix B.
-    /// \param [in] ldb Leading dimension of B.
-    /// \param [in] beta Scaling factor for matrix C.
-    /// \param [in, out] c Input/Output matrix C.
-    /// \param [in] c_type Data type of the matrix C.
-    /// \param [in] ldc Leading dimension of C.
-    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-    /// \param [in] scaling_type Data type of the scaling factors.
+      /// Computes a batch of matrix-matrix product with general matrices.
+      /// \param [in] q The queue where the routine should be executed.
+      /// \param [in] a_trans Specifies the operation applied to A.
+      /// \param [in] b_trans Specifies the operation applied to B.
+      /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+      /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+      /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+      /// \param [in] alpha Scaling factor for the matrix-matrix product.
+      /// \param [in] a Input matrix A.
+      /// \param [in] a_type Data type of the matrix A.
+      /// \param [in] lda Leading dimension of A.
+      /// \param [in] b Input matrix B.
+      /// \param [in] b_type Data type of the matrix B.
+      /// \param [in] ldb Leading dimension of B.
+      /// \param [in] beta Scaling factor for matrix C.
+      /// \param [in, out] c Input/Output matrix C.
+      /// \param [in] c_type Data type of the matrix C.
+      /// \param [in] ldc Leading dimension of C.
+      /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+      /// \param [in] scaling_type Data type of the scaling factors.
     inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
         oneapi::mkl::transpose b_trans, int m, int n, int k,
         const void *alpha, const void *a[],
@@ -2396,121 +2360,121 @@ namespace dpct
             case detail::get_type_combination_id(
                 library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-            {
+        {
                 detail::gemm_batch_impl<float, float, float, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
                     batch_size);
                 break;
             }
-            case detail::get_type_combination_id(
-                library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-            {
-                detail::gemm_batch_impl<double, double, double, double>(
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                std::complex<float>, std::complex<float>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
                     batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-            {
-                detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                    std::complex<float>, std::complex<float>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                        batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-            {
-                detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                    std::complex<double>, std::complex<double>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                        batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-            {
-                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                        a, lda, b, ldb, beta, c, ldc,
-                        batch_size);
-                break;
-            }
-#ifdef __INTEL_MKL__
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                    oneapi::mkl::bfloat16, float>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                        batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                        b, ldb, beta, c, ldc, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-            {
-                float alpha_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-                float beta_float =
-                    dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-                detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                        a, lda, b, ldb, &beta_float, c, ldc,
-                        batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                std::complex<double>, std::complex<double>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
                     batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                    a, lda, b, ldb, beta, c, ldc,
+                    batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                oneapi::mkl::bfloat16, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
                     batch_size);
-                break;
-            }
-#endif
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-            {
-                float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-                float beta_value =
-                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
-                sycl::half alpha_half(alpha_value);
-                sycl::half beta_half(beta_value);
-                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                    q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                    b, ldb, beta, c, ldc, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                    a, lda, b, ldb, &beta_float, c, ldc,
                     batch_size);
-                break;
-            }
-            default:
-                throw std::runtime_error("the combination of data type is unsupported");
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+                batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
         }
     }
 
@@ -2564,118 +2528,118 @@ namespace dpct
             case detail::get_type_combination_id(
                 library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
-            {
+        {
                 detail::gemm_batch_impl<float, float, float, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
                     beta, c, ldc, stride_c, batch_size);
                 break;
             }
-            case detail::get_type_combination_id(
-                library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-            {
-                detail::gemm_batch_impl<double, double, double, double>(
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                std::complex<float>, std::complex<float>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
                     beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-            {
-                detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                    std::complex<float>, std::complex<float>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                        beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-            {
-                detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                    std::complex<double>, std::complex<double>>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                        beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-            {
-                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                        a, lda, stride_a, b, ldb, stride_b,
-                        beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-#ifdef __INTEL_MKL__
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                    oneapi::mkl::bfloat16, float>(
-                        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                        beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                        stride_a, b, ldb, stride_b, beta, c, ldc,
-                        stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-            {
-                detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                        a, lda, stride_a, b, ldb, stride_b,
-                        beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                std::complex<double>, std::complex<double>>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
                     beta, c, ldc, stride_c, batch_size);
-                break;
-            }
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-            {
-                detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                    a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                oneapi::mkl::bfloat16, float>(
                     q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
                     beta, c, ldc, stride_c, batch_size);
-                break;
-            }
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                    stride_a, b, ldb, stride_b, beta, c, ldc,
+                    stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                    a, lda, stride_a, b, ldb, stride_b,
+                    beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
 #endif
-            case detail::get_type_combination_id(
-                library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-            {
-                float alpha_value =
-                    dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-                float beta_value =
-                    dpct::get_value(reinterpret_cast<const float *>(beta), q);
-                sycl::half alpha_half(alpha_value);
-                sycl::half beta_half(beta_value);
-                detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                    q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-                    &beta_half, c, ldc, stride_c, batch_size);
-                break;
-            }
-            default:
-                throw std::runtime_error("the combination of data type is unsupported");
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+                &beta_half, c, ldc, stride_c, batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
         }
     }
 
@@ -2900,7 +2864,7 @@ namespace dpct
                 typename detail::memory_traits<Memory, T>::template accessor_t<0>;
 
             /// Constructor with initial value.
-            device_memory(const value_t &val) : base(sycl::range<1>(1), { val }) {}
+            device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
 
             /// Default constructor
             device_memory() : base(1) {}

From 8a3d501cdac828ec40311a02fa9236ac58aced2c Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 08:26:06 +0000
Subject: [PATCH 05/11] revert format

---
 ggml-sycl/dpct/helper.hpp | 1247 +++++++++++++++++++------------------
 1 file changed, 624 insertions(+), 623 deletions(-)

diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp
index 6275993261792..1ff297218c685 100644
--- a/ggml-sycl/dpct/helper.hpp
+++ b/ggml-sycl/dpct/helper.hpp
@@ -91,22 +91,22 @@ namespace dpct
 
     /// SYCL default exception handler
     inline auto exception_handler = [](sycl::exception_list exceptions)
+    {
+        for (std::exception_ptr const &e : exceptions)
         {
-            for (std::exception_ptr const &e : exceptions)
+            try
             {
-                try
-                {
-                    std::rethrow_exception(e);
-                }
-                catch (sycl::exception const &e)
-                {
-                    std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                        << e.what() << std::endl
-                        << "Exception caught at file:" << __FILE__
-                        << ", line:" << __LINE__ << std::endl;
-                }
+                std::rethrow_exception(e);
             }
-        };
+            catch (sycl::exception const &e)
+            {
+                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                          << e.what() << std::endl
+                          << "Exception caught at file:" << __FILE__
+                          << ", line:" << __LINE__ << std::endl;
+            }
+        }
+    };
 
     enum error_code
     {
@@ -206,23 +206,23 @@ namespace dpct
             ver = dev.get_info<sycl::info::device::version>();
             std::string::size_type i = 0;
             while (i < ver.size()) {
-                if (isdigit(ver[i]))
-                    break;
-                i++;
+              if (isdigit(ver[i]))
+                break;
+              i++;
             }
             major = std::stoi(&(ver[i]));
             while (i < ver.size()) {
-                if (ver[i] == '.')
-                    break;
-                i++;
+              if (ver[i] == '.')
+                break;
+              i++;
             }
             if (i < ver.size()) {
-                // a. and b.
-                i++;
-                minor = std::stoi(&(ver[i]));
+              // a. and b.
+              i++;
+              minor = std::stoi(&(ver[i]));
             } else {
-                // c.
-                minor = 0;
+              // c.
+              minor = 0;
             }
         }
 
@@ -240,7 +240,7 @@ namespace dpct
 
     } // namespace detail
 
-      /// Pitched 2D/3D memory data.
+    /// Pitched 2D/3D memory data.
     class pitched_data
     {
     public:
@@ -272,30 +272,30 @@ namespace dpct
         const char *get_name() const { return _name; }
         char *get_name() { return _name; }
         template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-            std::is_same_v<WorkItemSizesTy, int *>,
-            int> = 0>
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
         auto get_max_work_item_sizes() const
         {
             if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
                 return sycl::range<3>(_max_work_item_sizes_i[0],
-                    _max_work_item_sizes_i[1],
-                    _max_work_item_sizes_i[2]);
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
             else
             {
                 return _max_work_item_sizes_i;
             }
         }
         template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-            std::is_same_v<WorkItemSizesTy, int *>,
-            int> = 0>
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
         auto get_max_work_item_sizes()
         {
             if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
                 return sycl::range<3>(_max_work_item_sizes_i[0],
-                    _max_work_item_sizes_i[1],
-                    _max_work_item_sizes_i[2]);
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
             else
             {
                 return _max_work_item_sizes_i;
@@ -318,9 +318,9 @@ namespace dpct
             return _max_register_size_per_work_group;
         }
         template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-            std::is_same_v<NDRangeSizeTy, int *>,
-            int> = 0>
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
         auto get_max_nd_range_size() const
         {
             if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
@@ -329,9 +329,9 @@ namespace dpct
                 return _max_nd_range_size_i;
         }
         template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-            std::is_same_v<NDRangeSizeTy, int *>,
-            int> = 0>
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
         auto get_max_nd_range_size()
         {
             if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
@@ -376,7 +376,7 @@ namespace dpct
                 _max_work_item_sizes_i[i] = max_work_item_sizes[i];
         }
         [[deprecated]] void
-            set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
         {
             for (int i = 0; i < 3; ++i)
             {
@@ -416,7 +416,7 @@ namespace dpct
             _max_sub_group_size = max_sub_group_size;
         }
         void
-            set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
         {
             _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
         }
@@ -437,7 +437,7 @@ namespace dpct
             _memory_bus_width = memory_bus_width;
         }
         void
-            set_max_register_size_per_work_group(int max_register_size_per_work_group)
+        set_max_register_size_per_work_group(int max_register_size_per_work_group)
         {
             _max_register_size_per_work_group = max_register_size_per_work_group;
         }
@@ -556,13 +556,13 @@ namespace dpct
         Use 3200000 kHz as memory_clock_rate default value. \
         Use 64 bits as memory_bus_width default value.")
 #else
-        #warning "get_device_info: querying memory_clock_rate and \
+#warning "get_device_info: querying memory_clock_rate and \
         memory_bus_width are not supported by the compiler used. \
         Use 3200000 kHz as memory_clock_rate default value. \
         Use 64 bits as memory_bus_width default value."
 #endif
 
-            size_t max_sub_group_size = 1;
+        size_t max_sub_group_size = 1;
         std::vector<size_t> sub_group_sizes =
             dev.get_info<sycl::info::device::sub_group_sizes>();
 
@@ -588,221 +588,222 @@ namespace dpct
         out = prop;
     }
 
-    /// dpct device extension
+   /// dpct device extension
     class device_ext : public sycl::device {
-        typedef std::mutex mutex_type;
-
-    public:
-        device_ext() : sycl::device() {}
-        ~device_ext() {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-        }
-        device_ext(const sycl::device &base) : sycl::device(base) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            init_queues();
-        }
-
-        int is_native_atomic_supported() { return 0; }
-        int get_major_version() const { return dpct::get_major_version(*this); }
-
-        int get_minor_version() const { return dpct::get_minor_version(*this); }
-
-        int get_max_compute_units() const {
-            return get_device_info().get_max_compute_units();
-        }
-
-        /// Return the maximum clock frequency of this device in KHz.
-        int get_max_clock_frequency() const {
-            return get_device_info().get_max_clock_frequency();
-        }
-
-        int get_integrated() const { return get_device_info().get_integrated(); }
-
-        int get_max_sub_group_size() const {
-            return get_device_info().get_max_sub_group_size();
-        }
-
-        int get_max_register_size_per_work_group() const {
-            return get_device_info().get_max_register_size_per_work_group();
-        }
-
-        int get_max_work_group_size() const {
-            return get_device_info().get_max_work_group_size();
-        }
-
-        int get_mem_base_addr_align() const {
-            return get_info<sycl::info::device::mem_base_addr_align>();
-        }
-
-        size_t get_global_mem_size() const {
-            return get_device_info().get_global_mem_size();
-        }
-
-        size_t get_max_mem_alloc_size() const {
-            return get_device_info().get_max_mem_alloc_size();
-        }
-
-        /// Get the number of bytes of free and total memory on the SYCL device.
-        /// \param [out] free_memory The number of bytes of free memory on the
-        /// SYCL device. \param [out] total_memory The number of bytes of total
-        /// memory on the SYCL device.
-        void get_memory_info(size_t &free_memory, size_t &total_memory) {
-            total_memory = get_device_info().get_global_mem_size();
-            const char *warning_info =
-                "get_memory_info: [warning] ext_intel_free_memory is not "
-                "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
-                "use total memory as free memory";
+      typedef std::mutex mutex_type;
+
+     public:
+      device_ext() : sycl::device() {}
+      ~device_ext() {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        clear_queues();
+      }
+      device_ext(const sycl::device &base) : sycl::device(base) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        init_queues();
+      }
+
+      int is_native_atomic_supported() { return 0; }
+      int get_major_version() const { return dpct::get_major_version(*this); }
+
+      int get_minor_version() const { return dpct::get_minor_version(*this); }
+
+      int get_max_compute_units() const {
+        return get_device_info().get_max_compute_units();
+      }
+
+      /// Return the maximum clock frequency of this device in KHz.
+      int get_max_clock_frequency() const {
+        return get_device_info().get_max_clock_frequency();
+      }
+
+      int get_integrated() const { return get_device_info().get_integrated(); }
+
+      int get_max_sub_group_size() const {
+        return get_device_info().get_max_sub_group_size();
+      }
+
+      int get_max_register_size_per_work_group() const {
+        return get_device_info().get_max_register_size_per_work_group();
+      }
+
+      int get_max_work_group_size() const {
+        return get_device_info().get_max_work_group_size();
+      }
+
+      int get_mem_base_addr_align() const {
+        return get_info<sycl::info::device::mem_base_addr_align>();
+      }
+
+      size_t get_global_mem_size() const {
+        return get_device_info().get_global_mem_size();
+      }
+
+      size_t get_max_mem_alloc_size() const {
+        return get_device_info().get_max_mem_alloc_size();
+      }
+
+      /// Get the number of bytes of free and total memory on the SYCL device.
+      /// \param [out] free_memory The number of bytes of free memory on the
+      /// SYCL device. \param [out] total_memory The number of bytes of total
+      /// memory on the SYCL device.
+      void get_memory_info(size_t &free_memory, size_t &total_memory) {
+        total_memory = get_device_info().get_global_mem_size();
+        const char *warning_info =
+            "get_memory_info: [warning] ext_intel_free_memory is not "
+            "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
+            "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-            if (!has(sycl::aspect::ext_intel_free_memory)) {
-                std::cerr << warning_info << std::endl;
-                free_memory = total_memory;
-            } else {
-                free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-            }
+        if (!has(sycl::aspect::ext_intel_free_memory)) {
+          std::cerr << warning_info << std::endl;
+          free_memory = total_memory;
+        } else {
+          free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+        }
 #else
-            std::cerr << warning_info << std::endl;
-            free_memory = total_memory;
+        std::cerr << warning_info << std::endl;
+        free_memory = total_memory;
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma message("Querying the number of bytes of free memory is not supported")
 #else
-            #warning "Querying the number of bytes of free memory is not supported"
+#warning "Querying the number of bytes of free memory is not supported"
 #endif
 #endif
-        }
-
-        void get_device_info(device_info &out) const {
-            dpct::get_device_info(out, *this);
-        }
-
-        device_info get_device_info() const {
-            device_info prop;
-            dpct::get_device_info(prop, *this);
-            return prop;
-        }
-
-        void reset() {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-            init_queues();
-        }
-
-        sycl::queue &in_order_queue() { return _q_in_order; }
-
-        sycl::queue &out_of_order_queue() { return _q_out_of_order; }
+      }
 
-        sycl::queue &default_queue() { return in_order_queue(); }
+      void get_device_info(device_info &out) const {
+        dpct::get_device_info(out, *this);
+      }
 
-        void queues_wait_and_throw() {
-            std::unique_lock<mutex_type> lock(m_mutex);
-            lock.unlock();
-            for (auto &q : _queues) {
-                q.wait_and_throw();
-            }
-            // Guard the destruct of current_queues to make sure the ref count is
-            // safe.
-            lock.lock();
-        }
-
-        sycl::queue create_queue(bool enable_exception_handler = false) {
-            return create_in_order_queue(enable_exception_handler);
-        }
-
-        sycl::queue create_queue(sycl::device device,
-            bool enable_exception_handler = false) {
-            return create_in_order_queue(device, enable_exception_handler);
-        }
-
-        sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler,
-                sycl::property::queue::in_order());
-        }
-
-        sycl::queue create_in_order_queue(sycl::device device,
-            bool enable_exception_handler = false) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(device, enable_exception_handler,
-                sycl::property::queue::in_order());
-        }
-
-        sycl::queue create_out_of_order_queue(
-            bool enable_exception_handler = false) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler);
-        }
-
-        void destroy_queue(sycl::queue queue) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _queues.clear();
-        }
-        void set_saved_queue(sycl::queue q) {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _saved_queue = q;
-        }
-        sycl::queue get_saved_queue() const {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return _saved_queue;
-        }
-
-    private:
-        void clear_queues() { _queues.clear(); }
-
-        void init_queues() {
-            _q_in_order =
-                create_queue_impl(true, sycl::property::queue::in_order());
-            _q_out_of_order = create_queue_impl(true);
-            _saved_queue = default_queue();
-        }
-
-        /// Caller should acquire resource \p m_mutex before calling this
-        /// function.
-        template <class... Properties>
-        sycl::queue create_queue_impl(bool enable_exception_handler,
-            Properties... properties) {
-            sycl::async_handler eh = {};
-            if (enable_exception_handler) {
-                eh = exception_handler;
-            }
-            auto q = sycl::queue(*this, eh,
-                sycl::property_list(
+      device_info get_device_info() const {
+        device_info prop;
+        dpct::get_device_info(prop, *this);
+        return prop;
+      }
+
+      void reset() {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        clear_queues();
+        init_queues();
+      }
+
+      sycl::queue &in_order_queue() { return _q_in_order; }
+
+      sycl::queue &out_of_order_queue() { return _q_out_of_order; }
+
+      sycl::queue &default_queue() { return in_order_queue(); }
+
+      void queues_wait_and_throw() {
+        std::unique_lock<mutex_type> lock(m_mutex);
+        lock.unlock();
+        for (auto &q : _queues) {
+          q.wait_and_throw();
+        }
+        // Guard the destruct of current_queues to make sure the ref count is
+        // safe.
+        lock.lock();
+      }
+
+      sycl::queue create_queue(bool enable_exception_handler = false) {
+        return create_in_order_queue(enable_exception_handler);
+      }
+
+      sycl::queue create_queue(sycl::device device,
+                               bool enable_exception_handler = false) {
+        return create_in_order_queue(device, enable_exception_handler);
+      }
+
+      sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(enable_exception_handler,
+                                 sycl::property::queue::in_order());
+      }
+
+      sycl::queue create_in_order_queue(sycl::device device,
+                                        bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(device, enable_exception_handler,
+                                 sycl::property::queue::in_order());
+      }
+
+      sycl::queue create_out_of_order_queue(
+          bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(enable_exception_handler);
+      }
+
+      void destroy_queue(sycl::queue queue) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        _queues.clear();
+      }
+      void set_saved_queue(sycl::queue q) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        _saved_queue = q;
+      }
+      sycl::queue get_saved_queue() const {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return _saved_queue;
+      }
+
+     private:
+      void clear_queues() { _queues.clear(); }
+
+      void init_queues() {
+        _q_in_order =
+            create_queue_impl(true, sycl::property::queue::in_order());
+        _q_out_of_order = create_queue_impl(true);
+        _saved_queue = default_queue();
+      }
+
+      /// Caller should acquire resource \p m_mutex before calling this
+      /// function.
+      template <class... Properties>
+      sycl::queue create_queue_impl(bool enable_exception_handler,
+                                    Properties... properties) {
+        sycl::async_handler eh = {};
+        if (enable_exception_handler) {
+          eh = exception_handler;
+        }
+        auto q = sycl::queue(*this, eh,
+                             sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                    sycl::property::queue::enable_profiling(),
+                                 sycl::property::queue::enable_profiling(),
 #endif
-                    properties...));
-            _queues.push_back(q);
-
-            return _queues.back();
-        }
-
-        template <class... Properties>
-        sycl::queue create_queue_impl(sycl::device device,
-            bool enable_exception_handler,
-            Properties... properties) {
-            sycl::async_handler eh = {};
-            if (enable_exception_handler) {
-                eh = exception_handler;
-            }
-            _queues.push_back(
-                sycl::queue(device, eh,
-                    sycl::property_list(
+                                 properties...));
+        _queues.push_back(q);
+
+        return _queues.back();
+      }
+
+      template <class... Properties>
+      sycl::queue create_queue_impl(sycl::device device,
+                                    bool enable_exception_handler,
+                                    Properties... properties) {
+        sycl::async_handler eh = {};
+        if (enable_exception_handler) {
+          eh = exception_handler;
+        }
+        _queues.push_back(
+            sycl::queue(device, eh,
+                        sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                        sycl::property::queue::enable_profiling(),
+                            sycl::property::queue::enable_profiling(),
 #endif
-                        properties...)));
-
-            return _queues.back();
-        }
-
-        void get_version(int &major, int &minor) const {
-            detail::get_version(*this, major, minor);
-        }
-        sycl::queue _q_in_order, _q_out_of_order;
-        sycl::queue _saved_queue;
-        std::vector<sycl::queue> _queues;
-        mutable mutex_type m_mutex;
+                            properties...)));
+
+        return _queues.back();
+      }
+
+      void get_version(int &major, int &minor) const {
+        detail::get_version(*this, major, minor);
+      }
+      sycl::queue _q_in_order, _q_out_of_order;
+      sycl::queue _saved_queue;
+      std::vector<sycl::queue> _queues;
+      mutable mutex_type m_mutex;
     };
 
+
     /// device manager
     class dev_mgr
     {
@@ -868,7 +869,7 @@ namespace dpct
         template <class DeviceSelector>
         std::enable_if_t<
             std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-            select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
         {
             sycl::device selected_device = sycl::device(selector);
             unsigned int selected_device_id = get_device_id(selected_device);
@@ -1000,7 +1001,7 @@ namespace dpct
         };
 
         static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-            const void *ptr)
+                                                              const void *ptr)
         {
             switch (sycl::get_pointer_type(ptr, q.get_context()))
             {
@@ -1018,19 +1019,19 @@ namespace dpct
         inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
         {
             static_assert((unsigned char)library_data_t::library_data_t_size <=
-                std::numeric_limits<unsigned char>::max() &&
-                "library_data_t size exceeds limit.");
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
             static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
             return (std::uint64_t)Val;
         }
 
         template <typename FirstT, typename... RestT>
         inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-            RestT... RestVal)
+                                                               RestT... RestVal)
         {
             static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                std::numeric_limits<unsigned char>::max() &&
-                "library_data_t size exceeds limit.");
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
             static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
             static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
             return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
@@ -1044,7 +1045,7 @@ namespace dpct
 #if defined(__linux__)
                 mapped_address_space =
                     (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 #elif defined(_WIN64)
                 mapped_address_space = (byte_t *)VirtualAlloc(
                     NULL,               // NULL specified as the base address parameter
@@ -1129,7 +1130,7 @@ namespace dpct
             {
                 std::lock_guard<std::mutex> lock(m_mutex);
                 return (mapped_address_space <= ptr) &&
-                    (ptr < mapped_address_space + mapped_region_size);
+                       (ptr < mapped_address_space + mapped_region_size);
             }
 
             /// Returns the instance of memory manager singleton.
@@ -1180,7 +1181,7 @@ namespace dpct
                 sycl::access::target::device;
             static constexpr sycl::access_mode mode =
                 (Memory == constant) ? sycl::access_mode::read
-                : sycl::access_mode::read_write;
+                                     : sycl::access_mode::read_write;
             static constexpr size_t type_size = sizeof(T);
             using element_t =
                 typename std::conditional<Memory == constant, const T, T>::type;
@@ -1199,41 +1200,41 @@ namespace dpct
 
 #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
         static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-            sycl::queue &q)
+                                        sycl::queue &q)
         {
             pitch = PITCH_DEFAULT_ALIGN(x);
             return dpct_malloc(pitch * y * z, q);
         }
 
         /**
-        * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
-        * @tparam valueT The type of the element to be set.
-        * @param [in] q The queue in which the operation is done.
-        * @param [in] dev_ptr Pointer to the virtual device memory address.
-        * @param [in] value The value to be set.
-        * @param [in] size Number of elements to be set to the value.
-        * @return An event representing the memset operation.
-        */
+         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] dev_ptr Pointer to the virtual device memory address.
+         * @param [in] value The value to be set.
+         * @param [in] size Number of elements to be set to the value.
+         * @return An event representing the memset operation.
+         */
         template <typename valueT>
         static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-            valueT value, size_t size)
+                                              valueT value, size_t size)
         {
             return q.fill(dev_ptr, value, size);
         }
 
         /**
-        * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-        * @tparam valueT The type of the element to be set.
-        * @param [in] q The queue in which the operation is done.
-        * @param [in] data Pointer to the pitched device memory region.
-        * @param [in] value The value to be set.
-        * @param [in] size 3D memory region by number of elements.
-        * @return An event list representing the memset operations.
-        */
+         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] data Pointer to the pitched device memory region.
+         * @param [in] value The value to be set.
+         * @param [in] size 3D memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
         template <typename valueT>
         static inline std::vector<sycl::event>
-            dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-                sycl::range<3> size)
+        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+                    sycl::range<3> size)
         {
             std::vector<sycl::event> event_list;
             size_t slice = data.get_pitch() * data.get_y();
@@ -1252,28 +1253,28 @@ namespace dpct
         }
 
         /**
-        * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
-        * @tparam valueT The type of the element to be set.
-        * @param [in] q The queue in which the operation is done.
-        * @param [in] ptr Pointer to the virtual device memory.
-        * @param [in] pitch The pitch size by number of elements, including padding.
-        * @param [in] val The value to be set.
-        * @param [in] x The width of memory region by number of elements.
-        * @param [in] y The height of memory region by number of elements.
-        * @return An event list representing the memset operations.
-        */
+         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] ptr Pointer to the virtual device memory.
+         * @param [in] pitch The pitch size by number of elements, including padding.
+         * @param [in] val The value to be set.
+         * @param [in] x The width of memory region by number of elements.
+         * @param [in] y The height of memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
         template <typename valueT>
         static inline std::vector<sycl::event>
-            dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-                size_t y)
+        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+                    size_t y)
         {
             return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                sycl::range<3>(x, y, 1));
+                               sycl::range<3>(x, y, 1));
         }
 
         static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-            const void *from_ptr,
-            memcpy_direction dir)
+                                                        const void *from_ptr,
+                                                        memcpy_direction dir)
         {
             switch (dir)
             {
@@ -1283,20 +1284,20 @@ namespace dpct
             case memcpy_direction::device_to_device:
                 return dir;
             case memcpy_direction::automatic:
-                {
+            {
                 // table[to_attribute][from_attribute]
                 static const memcpy_direction
                     direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                    [static_cast<unsigned>(pointer_access_attribute::end)] =
-                {{memcpy_direction::host_to_host,
-                    memcpy_direction::device_to_host,
-                    memcpy_direction::host_to_host},
-                    {memcpy_direction::host_to_device,
-                    memcpy_direction::device_to_device,
-                    memcpy_direction::device_to_device},
-                    {memcpy_direction::host_to_host,
-                    memcpy_direction::device_to_device,
-                    memcpy_direction::device_to_device}};
+                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
+                                       {{memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_host,
+                                         memcpy_direction::host_to_host},
+                                        {memcpy_direction::host_to_device,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device},
+                                        {memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device}};
                 return direction_table[static_cast<unsigned>(get_pointer_attribute(
                     q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
             }
@@ -1306,9 +1307,9 @@ namespace dpct
         }
 
         static sycl::event
-            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                    memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
         {
             if (!size)
                 return sycl::event{};
@@ -1318,13 +1319,13 @@ namespace dpct
 
         // Get actual copy range and make sure it will not exceed range.
         static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-            size_t pitch)
+                                            size_t pitch)
         {
             return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
         }
 
         static inline size_t get_offset(sycl::id<3> id, size_t slice,
-            size_t pitch)
+                                        size_t pitch)
         {
             return slice * id.get(2) + pitch * id.get(1) + id.get(0);
         }
@@ -1332,11 +1333,11 @@ namespace dpct
         /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
         /// and \p from_range to another specified by \p to_ptr and \p to_range.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                sycl::range<3> to_range, sycl::range<3> from_range,
-                sycl::id<3> to_id, sycl::id<3> from_id,
-                sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    sycl::range<3> to_range, sycl::range<3> from_range,
+                    sycl::id<3> to_id, sycl::id<3> from_id,
+                    sycl::range<3> size, memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
         {
             // RAII for host pointer
             class host_buffer
@@ -1348,7 +1349,7 @@ namespace dpct
 
             public:
                 host_buffer(size_t size, sycl::queue &q,
-                    const std::vector<sycl::event> &deps)
+                            const std::vector<sycl::event> &deps)
                     : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
                 void *get_ptr() const { return _buf; }
                 size_t get_size() const { return _size; }
@@ -1357,16 +1358,16 @@ namespace dpct
                     if (_buf)
                     {
                         _q.submit([&](sycl::handler &cgh)
-                            {
-                                cgh.depends_on(_deps);
-                                cgh.host_task([buf = _buf] { std::free(buf); }); });
+                                  {
+        cgh.depends_on(_deps);
+        cgh.host_task([buf = _buf] { std::free(buf); }); });
                     }
                 }
             };
             std::vector<sycl::event> event_list;
 
             size_t to_slice = to_range.get(1) * to_range.get(0),
-                from_slice = from_range.get(1) * from_range.get(0);
+                   from_slice = from_range.get(1) * from_range.get(0);
             unsigned char *to_surface =
                 (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
             const unsigned char *from_surface =
@@ -1376,7 +1377,7 @@ namespace dpct
             if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
             {
                 return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                    direction, dep_events)};
+                                    direction, dep_events)};
             }
             direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
             size_t size_slice = size.get(1) * size.get(0);
@@ -1391,14 +1392,14 @@ namespace dpct
                         to_range.get(0) == size.get(0))
                     {
                         event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                            direction, dep_events));
+                                                         direction, dep_events));
                     }
                     else
                     {
                         for (size_t y = 0; y < size.get(1); ++y)
                         {
                             event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                direction, dep_events));
+                                                             direction, dep_events));
                             to_ptr += to_range.get(0);
                             from_ptr += from_range.get(0);
                         }
@@ -1408,17 +1409,17 @@ namespace dpct
                 }
                 break;
             case host_to_device:
-                {
+            {
                 host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                    event_list);
+                                event_list);
                 std::vector<sycl::event> host_events;
                 if (to_slice == size_slice)
                 {
                     // Copy host data to a temp host buffer with the shape of target.
                     host_events =
                         dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                            host_to_host, dep_events);
+                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                    host_to_host, dep_events);
                 }
                 else
                 {
@@ -1429,39 +1430,39 @@ namespace dpct
                         // If has padding data, not sure whether it is useless. So fill temp
                         // buffer with it.
                         std::vector<sycl::event>{
-                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                            device_to_host, dep_events)});
+                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                        device_to_host, dep_events)});
                 }
                 // Copy from temp host buffer to device with only one submit.
                 event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                    buf.get_size(), host_to_device,
-                    host_events));
+                                                 buf.get_size(), host_to_device,
+                                                 host_events));
                 break;
             }
             case device_to_host:
-                {
+            {
                 host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                    event_list);
+                                event_list);
                 // Copy from host temp buffer to host target with reshaping.
                 event_list = dpct_memcpy(
                     q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
                     sycl::id<3>(0, 0, 0), size, host_to_host,
                     // Copy from device to temp host buffer with only one submit.
                     std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                        buf.get_size(),
-                        device_to_host, dep_events)});
+                                                         buf.get_size(),
+                                                         device_to_host, dep_events)});
                 break;
             }
             case device_to_device:
                 event_list.push_back(q.submit([&](sycl::handler &cgh){
-                    cgh.depends_on(dep_events);
-                    cgh.parallel_for<class dpct_memcpy_3d_detail>(
-                        size,
-                        [=](sycl::id<3> id) {
-                            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                                from_surface[get_offset(id, from_slice, from_range.get(0))];
-                        }); }));
-                    break;
+                cgh.depends_on(dep_events);
+                cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                    size,
+                    [=](sycl::id<3> id) {
+                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                            from_surface[get_offset(id, from_slice, from_range.get(0))];
+                    }); }));
+                break;
             default:
                 throw std::runtime_error("dpct_memcpy: invalid direction value");
             }
@@ -1470,26 +1471,26 @@ namespace dpct
 
         /// memcpy 2D/3D matrix specified by pitched_data.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                memcpy_direction direction = automatic)
+        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                    memcpy_direction direction = automatic)
         {
             return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                size, direction);
+                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                               size, direction);
         }
 
         /// memcpy 2D matrix with pitch.
         static inline std::vector<sycl::event>
-            dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                memcpy_direction direction = automatic)
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic)
         {
             return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                sycl::range<3>(from_pitch, y, 1),
-                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                sycl::range<3>(x, y, 1), direction);
+                               sycl::range<3>(from_pitch, y, 1),
+                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                               sycl::range<3>(x, y, 1), direction);
         }
 
         namespace deprecated
@@ -1555,7 +1556,7 @@ namespace dpct
         } // namespace deprecated
 
         inline void dpct_free(void *ptr,
-            const sycl::queue &q)
+                              const sycl::queue &q)
         {
             if (ptr)
             {
@@ -1577,7 +1578,7 @@ namespace dpct
             Ty s_h;
             if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
                 detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
-                .wait();
+                    .wait();
             else
                 s_h = *reinterpret_cast<const Ty *>(s);
             return s_h;
@@ -1595,9 +1596,9 @@ namespace dpct
     {
         template <class Ta, class Tb, class Tc, class Ts>
         inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-            oneapi::mkl::transpose b_trans, int m, int n, int k,
-            const void *alpha, const void *a, int lda, const void *b,
-            int ldb, const void *beta, void *c, int ldc)
+                              oneapi::mkl::transpose b_trans, int m, int n, int k,
+                              const void *alpha, const void *a, int lda, const void *b,
+                              int ldb, const void *beta, void *c, int ldc)
         {
             Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
             Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
@@ -1638,10 +1639,10 @@ namespace dpct
 
         template <class Ta, class Tb, class Tc, class Ts>
         inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-            oneapi::mkl::transpose b_trans, int m, int n, int k,
-            const void *alpha, const void **a, int lda,
-            const void **b, int ldb, const void *beta, void **c,
-            int ldc, int batch_size)
+                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
+                                    const void *alpha, const void **a, int lda,
+                                    const void **b, int ldb, const void *beta, void **c,
+                                    int ldc, int batch_size)
         {
             struct matrix_info_t
             {
@@ -1679,19 +1680,19 @@ namespace dpct
                 matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
 
             q.submit([&](sycl::handler &cgh)
-                {
-                    cgh.depends_on(e);
-                    cgh.host_task([=] { std::free(matrix_info); }); });
+                     {
+    cgh.depends_on(e);
+    cgh.host_task([=] { std::free(matrix_info); }); });
         }
 
         template <class Ta, class Tb, class Tc, class Ts>
         inline void
-            gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                oneapi::mkl::transpose b_trans, int m, int n,
-                int k, const void *alpha, const void *a, int lda,
-                long long int stride_a, const void *b, int ldb,
-                long long int stride_b, const void *beta, void *c,
-                int ldc, long long int stride_c, int batch_size)
+        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                        oneapi::mkl::transpose b_trans, int m, int n,
+                        int k, const void *alpha, const void *a, int lda,
+                        long long int stride_a, const void *b, int ldb,
+                        long long int stride_b, const void *beta, void *c,
+                        int ldc, long long int stride_c, int batch_size)
         {
             Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
             Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
@@ -1708,7 +1709,7 @@ namespace dpct
 
     template <typename VecT, class BinaryOperation>
     inline unsigned vectorized_binary(unsigned a, unsigned b,
-        const BinaryOperation binary_op)
+                                      const BinaryOperation binary_op)
     {
         sycl::vec<unsigned, 1> v0{a}, v1{b};
         auto v2 = v0.as<VecT>();
@@ -1720,8 +1721,8 @@ namespace dpct
     }
 
     static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-        memcpy_direction direction = automatic,
-        sycl::queue &q = dpct::get_default_queue())
+                                  memcpy_direction direction = automatic,
+                                  sycl::queue &q = dpct::get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
     }
@@ -1734,16 +1735,16 @@ namespace dpct
 
     template <typename T>
     T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-        unsigned int logical_sub_group_size = 32)
+                               unsigned int logical_sub_group_size = 32)
     {
         unsigned int id = g.get_local_linear_id();
         unsigned int start_index =
             id / logical_sub_group_size * logical_sub_group_size;
         unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
         return sycl::select_from_group(g, x,
-            target_offset < logical_sub_group_size
-            ? start_index + target_offset
-            : id);
+                                       target_offset < logical_sub_group_size
+                                           ? start_index + target_offset
+                                           : id);
     }
 
     template <typename T>
@@ -1751,14 +1752,14 @@ namespace dpct
     {
         return sycl::vec<T, 1>(val)
             .template as<sycl::vec<
-            std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
             .template convert<T>();
     }
 
     template <typename T1, typename T2>
     using dot_product_acc_t =
         std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-        uint32_t, int32_t>;
+                           uint32_t, int32_t>;
 
     template <typename T1, typename T2, typename T3>
     inline auto dp4a(T1 a, T2 b, T3 c)
@@ -1799,13 +1800,13 @@ namespace dpct
     inline double pow(const double a, const double b) { return sycl::pow(a, b); }
     template <typename T, typename U>
     inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-        pow(const T a, const U b)
+    pow(const T a, const U b)
     {
         return sycl::pow(a, static_cast<T>(b));
     }
     template <typename T, typename U>
     inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-        pow(const T a, const U b)
+    pow(const T a, const U b)
     {
         return sycl::pow(static_cast<double>(a), static_cast<double>(b));
     }
@@ -1932,8 +1933,8 @@ namespace dpct
     }
 
     inline void
-        has_capability_or_fail(const sycl::device &dev,
-            const std::initializer_list<sycl::aspect> &props)
+    has_capability_or_fail(const sycl::device &dev,
+                           const std::initializer_list<sycl::aspect> &props)
     {
         for (const auto &it : props)
         {
@@ -1943,13 +1944,13 @@ namespace dpct
             {
             case sycl::aspect::fp64:
                 throw std::runtime_error("'double' is not supported in '" +
-                    dev.get_info<sycl::info::device::name>() +
-                    "' device");
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
                 break;
             case sycl::aspect::fp16:
                 throw std::runtime_error("'half' is not supported in '" +
-                    dev.get_info<sycl::info::device::name>() +
-                    "' device");
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
                 break;
             default:
 #define __SYCL_ASPECT(ASPECT, ID) \
@@ -1958,15 +1959,15 @@ namespace dpct
 #define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
 #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
                 auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
+                {
+                    switch (AspectNum)
                     {
-                        switch (AspectNum)
-                        {
 #include <sycl/info/aspects.def>
 #include <sycl/info/aspects_deprecated.def>
-                        default:
-                            return "unknown aspect";
-                        }
-                    };
+                    default:
+                        return "unknown aspect";
+                    }
+                };
 #undef __SYCL_ASPECT_DEPRECATED_ALIAS
 #undef __SYCL_ASPECT_DEPRECATED
 #undef __SYCL_ASPECT
@@ -1994,9 +1995,9 @@ namespace dpct
     }
 
     static sycl::event
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-            memcpy_direction direction,
-            const std::vector<sycl::event> &dep_events = {})
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
     {
         if (!size)
             return sycl::event{};
@@ -2006,13 +2007,13 @@ namespace dpct
 
     // Get actual copy range and make sure it will not exceed range.
     static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-        size_t pitch)
+                                        size_t pitch)
     {
         return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
     }
 
     static inline size_t get_offset(sycl::id<3> id, size_t slice,
-        size_t pitch)
+                                    size_t pitch)
     {
         return slice * id.get(2) + pitch * id.get(1) + id.get(0);
     }
@@ -2020,11 +2021,11 @@ namespace dpct
     /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
     /// and \p from_range to another specified by \p to_ptr and \p to_range.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-            sycl::range<3> to_range, sycl::range<3> from_range,
-            sycl::id<3> to_id, sycl::id<3> from_id,
-            sycl::range<3> size, memcpy_direction direction,
-            const std::vector<sycl::event> &dep_events = {})
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
     {
         // RAII for host pointer
         class host_buffer
@@ -2036,7 +2037,7 @@ namespace dpct
 
         public:
             host_buffer(size_t size, sycl::queue &q,
-                const std::vector<sycl::event> &deps)
+                        const std::vector<sycl::event> &deps)
                 : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
             void *get_ptr() const { return _buf; }
             size_t get_size() const { return _size; }
@@ -2045,16 +2046,16 @@ namespace dpct
                 if (_buf)
                 {
                     _q.submit([&](sycl::handler &cgh)
-                        {
-                            cgh.depends_on(_deps);
-                            cgh.host_task([buf = _buf] { std::free(buf); }); });
+                              {
+            cgh.depends_on(_deps);
+            cgh.host_task([buf = _buf] { std::free(buf); }); });
                 }
             }
         };
         std::vector<sycl::event> event_list;
 
         size_t to_slice = to_range.get(1) * to_range.get(0),
-            from_slice = from_range.get(1) * from_range.get(0);
+               from_slice = from_range.get(1) * from_range.get(0);
         unsigned char *to_surface =
             (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
         const unsigned char *from_surface =
@@ -2064,7 +2065,7 @@ namespace dpct
         if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
         {
             return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                direction, dep_events)};
+                                direction, dep_events)};
         }
         direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
         size_t size_slice = size.get(1) * size.get(0);
@@ -2079,14 +2080,14 @@ namespace dpct
                     to_range.get(0) == size.get(0))
                 {
                     event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                        direction, dep_events));
+                                                     direction, dep_events));
                 }
                 else
                 {
                     for (size_t y = 0; y < size.get(1); ++y)
                     {
                         event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                            direction, dep_events));
+                                                         direction, dep_events));
                         to_ptr += to_range.get(0);
                         from_ptr += from_range.get(0);
                     }
@@ -2096,17 +2097,17 @@ namespace dpct
             }
             break;
         case host_to_device:
-            {
+        {
             host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                event_list);
+                            event_list);
             std::vector<sycl::event> host_events;
             if (to_slice == size_slice)
             {
                 // Copy host data to a temp host buffer with the shape of target.
                 host_events =
                     dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                        host_to_host, dep_events);
+                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                host_to_host, dep_events);
             }
             else
             {
@@ -2117,40 +2118,40 @@ namespace dpct
                     // If has padding data, not sure whether it is useless. So fill temp
                     // buffer with it.
                     std::vector<sycl::event>{
-                    dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                        device_to_host, dep_events)});
+                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                    device_to_host, dep_events)});
             }
             // Copy from temp host buffer to device with only one submit.
             event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                buf.get_size(), host_to_device,
-                host_events));
+                                             buf.get_size(), host_to_device,
+                                             host_events));
             break;
         }
         case device_to_host:
-            {
+        {
             host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                event_list);
+                            event_list);
             // Copy from host temp buffer to host target with reshaping.
             event_list = dpct_memcpy(
                 q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
                 sycl::id<3>(0, 0, 0), size, host_to_host,
                 // Copy from device to temp host buffer with only one submit.
                 std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                    buf.get_size(),
-                    device_to_host, dep_events)});
+                                                     buf.get_size(),
+                                                     device_to_host, dep_events)});
             break;
         }
         case device_to_device:
             event_list.push_back(q.submit([&](sycl::handler &cgh)
-                {
-                    cgh.depends_on(dep_events);
-                    cgh.parallel_for<class dpct_memcpy_3d_detail>(
-                        size,
-                        [=](sycl::id<3> id) {
-                            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                                from_surface[get_offset(id, from_slice, from_range.get(0))];
-                        }); }));
-                    break;
+                                          {
+        cgh.depends_on(dep_events);
+        cgh.parallel_for<class dpct_memcpy_3d_detail>(
+            size,
+            [=](sycl::id<3> id) {
+                to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                    from_surface[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+        break;
         default:
             throw std::runtime_error("dpct_memcpy: invalid direction value");
         }
@@ -2159,34 +2160,34 @@ namespace dpct
 
     /// memcpy 2D/3D matrix specified by pitched_data.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-            pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-            memcpy_direction direction = automatic)
+    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
     {
         return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-            sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-            sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-            size, direction);
+                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                           size, direction);
     }
 
     /// memcpy 2D matrix with pitch.
     static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-            size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-            memcpy_direction direction = automatic)
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
     {
         return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-            sycl::range<3>(from_pitch, y, 1),
-            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-            sycl::range<3>(x, y, 1), direction);
+                           sycl::range<3>(from_pitch, y, 1),
+                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                           sycl::range<3>(x, y, 1), direction);
     }
 
     inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-        oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void *alpha, const void *a, library_data_t a_type,
-        int lda, const void *b, library_data_t b_type, int ldb,
-        const void *beta, void *c, library_data_t c_type, int ldc,
-        library_data_t scaling_type)
+                     oneapi::mkl::transpose b_trans, int m, int n, int k,
+                     const void *alpha, const void *a, library_data_t a_type,
+                     int lda, const void *b, library_data_t b_type, int ldb,
+                     const void *beta, void *c, library_data_t c_type, int ldc,
+                     library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2194,7 +2195,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-            c_type == library_data_t::complex_double)
+                 c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2203,17 +2204,17 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-            case detail::get_type_combination_id(
-                library_data_t::real_float, library_data_t::real_float,
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
         {
-                detail::gemm_impl<float, float, float, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-                break;
-            }
+            detail::gemm_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
         case detail::get_type_combination_id(
             library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
+            library_data_t::real_double, library_data_t::real_double):
         {
             detail::gemm_impl<double, double, double, double>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
@@ -2221,44 +2222,44 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
+            library_data_t::complex_float, library_data_t::complex_float):
         {
             detail::gemm_impl<std::complex<float>, std::complex<float>,
-                std::complex<float>, std::complex<float>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                              std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
+            library_data_t::complex_double, library_data_t::complex_double):
         {
             detail::gemm_impl<std::complex<double>, std::complex<double>,
-                std::complex<double>, std::complex<double>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                              std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
+            library_data_t::real_half, library_data_t::real_half):
         {
             detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                    lda, b, ldb, beta, c, ldc);
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                          lda, b, ldb, beta, c, ldc);
             break;
         }
 #ifdef __INTEL_MKL__
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                    ldb, beta, c, ldc);
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_impl<sycl::half, sycl::half, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
@@ -2266,7 +2267,7 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
+            library_data_t::real_half, library_data_t::real_float):
         {
             float alpha_value =
                 dpct::get_value(reinterpret_cast<const float *>(alpha), q);
@@ -2275,13 +2276,13 @@ namespace dpct
             sycl::half alpha_half(alpha_value);
             sycl::half beta_half(beta_value);
             detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                    a, lda, b, ldb, &beta_half, c, ldc);
+                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                          a, lda, b, ldb, &beta_half, c, ldc);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
@@ -2289,16 +2290,16 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
+            library_data_t::real_bfloat16, library_data_t::real_float):
         {
             detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                oneapi::mkl::bfloat16, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+                              oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
+            library_data_t::real_int32, library_data_t::real_int32):
         {
             float alpha_float =
                 dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
@@ -2314,33 +2315,33 @@ namespace dpct
         }
     } // gemm()
 
-      /// Computes a batch of matrix-matrix product with general matrices.
-      /// \param [in] q The queue where the routine should be executed.
-      /// \param [in] a_trans Specifies the operation applied to A.
-      /// \param [in] b_trans Specifies the operation applied to B.
-      /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-      /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-      /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-      /// \param [in] alpha Scaling factor for the matrix-matrix product.
-      /// \param [in] a Input matrix A.
-      /// \param [in] a_type Data type of the matrix A.
-      /// \param [in] lda Leading dimension of A.
-      /// \param [in] b Input matrix B.
-      /// \param [in] b_type Data type of the matrix B.
-      /// \param [in] ldb Leading dimension of B.
-      /// \param [in] beta Scaling factor for matrix C.
-      /// \param [in, out] c Input/Output matrix C.
-      /// \param [in] c_type Data type of the matrix C.
-      /// \param [in] ldc Leading dimension of C.
-      /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-      /// \param [in] scaling_type Data type of the scaling factors.
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
     inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-        oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void *alpha, const void *a[],
-        library_data_t a_type, int lda, const void *b[],
-        library_data_t b_type, int ldb, const void *beta,
-        void *c[], library_data_t c_type, int ldc,
-        int batch_size, library_data_t scaling_type)
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a[],
+                           library_data_t a_type, int lda, const void *b[],
+                           library_data_t b_type, int ldb, const void *beta,
+                           void *c[], library_data_t c_type, int ldc,
+                           int batch_size, library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2348,7 +2349,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-            c_type == library_data_t::complex_double)
+                 c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2357,18 +2358,18 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-            case detail::get_type_combination_id(
-                library_data_t::real_float, library_data_t::real_float,
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
         {
-                detail::gemm_batch_impl<float, float, float, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                    batch_size);
-                break;
-            }
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
         case detail::get_type_combination_id(
             library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
+            library_data_t::real_double, library_data_t::real_double):
         {
             detail::gemm_batch_impl<double, double, double, double>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
@@ -2377,71 +2378,71 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
+            library_data_t::complex_float, library_data_t::complex_float):
         {
             detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                std::complex<float>, std::complex<float>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                    batch_size);
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
+            library_data_t::complex_double, library_data_t::complex_double):
         {
             detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                std::complex<double>, std::complex<double>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                    batch_size);
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
+            library_data_t::real_half, library_data_t::real_half):
         {
             detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                    a, lda, b, ldb, beta, c, ldc,
-                    batch_size);
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, b, ldb, beta, c, ldc,
+                                                batch_size);
             break;
         }
 #ifdef __INTEL_MKL__
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
+            library_data_t::real_bfloat16, library_data_t::real_float):
         {
             detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                oneapi::mkl::bfloat16, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                    batch_size);
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                    b, ldb, beta, c, ldc, batch_size);
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
+            library_data_t::real_int32, library_data_t::real_int32):
         {
             float alpha_float =
                 dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
             float beta_float =
                 dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
             detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                    a, lda, b, ldb, &beta_float, c, ldc,
-                    batch_size);
+                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                                           a, lda, b, ldb, &beta_float, c, ldc,
+                                           batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
@@ -2450,7 +2451,7 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
@@ -2460,7 +2461,7 @@ namespace dpct
 #endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
+            library_data_t::real_half, library_data_t::real_float):
         {
             float alpha_value =
                 dpct::get_value(reinterpret_cast<const float *>(alpha), q);
@@ -2502,13 +2503,13 @@ namespace dpct
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
     inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-        oneapi::mkl::transpose b_trans, int m, int n, int k,
-        const void *alpha, const void *a, library_data_t a_type,
-        int lda, long long int stride_a, const void *b,
-        library_data_t b_type, int ldb, long long int stride_b,
-        const void *beta, void *c, library_data_t c_type,
-        int ldc, long long int stride_c, int batch_size,
-        library_data_t scaling_type)
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a, library_data_t a_type,
+                           int lda, long long int stride_a, const void *b,
+                           library_data_t b_type, int ldb, long long int stride_b,
+                           const void *beta, void *c, library_data_t c_type,
+                           int ldc, long long int stride_c, int batch_size,
+                           library_data_t scaling_type)
     {
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
@@ -2516,7 +2517,7 @@ namespace dpct
             scaling_type = library_data_t::complex_float;
         }
         else if (scaling_type == library_data_t::real_double &&
-            c_type == library_data_t::complex_double)
+                 c_type == library_data_t::complex_double)
         {
             scaling_type = library_data_t::complex_double;
         }
@@ -2525,18 +2526,18 @@ namespace dpct
             detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
         switch (key)
         {
-            case detail::get_type_combination_id(
-                library_data_t::real_float, library_data_t::real_float,
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
             library_data_t::real_float, library_data_t::real_float):
         {
-                detail::gemm_batch_impl<float, float, float, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
-                break;
-            }
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
         case detail::get_type_combination_id(
             library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
+            library_data_t::real_double, library_data_t::real_double):
         {
             detail::gemm_batch_impl<double, double, double, double>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
@@ -2545,68 +2546,68 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
+            library_data_t::complex_float, library_data_t::complex_float):
         {
             detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                std::complex<float>, std::complex<float>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
+            library_data_t::complex_double, library_data_t::complex_double):
         {
             detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                std::complex<double>, std::complex<double>>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
+            library_data_t::real_half, library_data_t::real_half):
         {
             detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                    a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, stride_a, b, ldb, stride_b,
+                                                beta, c, ldc, stride_c, batch_size);
             break;
         }
 #ifdef __INTEL_MKL__
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
+            library_data_t::real_bfloat16, library_data_t::real_float):
         {
             detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                oneapi::mkl::bfloat16, float>(
-                    q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                    stride_a, b, ldb, stride_b, beta, c, ldc,
-                    stride_c, batch_size);
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           stride_a, b, ldb, stride_b, beta, c, ldc,
+                                           stride_c, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
+            library_data_t::real_int32, library_data_t::real_int32):
         {
             detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                    a, lda, stride_a, b, ldb, stride_b,
-                    beta, c, ldc, stride_c, batch_size);
+                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                                  a, lda, stride_a, b, ldb, stride_b,
+                                                  beta, c, ldc, stride_c, batch_size);
             break;
         }
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
@@ -2615,7 +2616,7 @@ namespace dpct
         }
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
+            library_data_t::real_float, library_data_t::real_float):
         {
             detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
@@ -2625,7 +2626,7 @@ namespace dpct
 #endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
+            library_data_t::real_half, library_data_t::real_float):
         {
             float alpha_value =
                 dpct::get_value(reinterpret_cast<const float *>(alpha), q);
@@ -2644,13 +2645,13 @@ namespace dpct
     }
 
     static inline void
-        async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-            size_t from_pitch, size_t x, size_t y,
-            memcpy_direction direction = automatic,
-            sycl::queue &q = get_default_queue())
+    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                      size_t from_pitch, size_t x, size_t y,
+                      memcpy_direction direction = automatic,
+                      sycl::queue &q = get_default_queue())
     {
         detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-            direction);
+                            direction);
     }
 
     using err0 = detail::generic_error_type<struct err0_tag, int>;
@@ -2717,7 +2718,7 @@ namespace dpct
         public:
             using accessor_t =
                 typename detail::memory_traits<Memory,
-                T>::template accessor_t<Dimension>;
+                                            T>::template accessor_t<Dimension>;
             using value_t = typename detail::memory_traits<Memory, T>::value_t;
             using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
 
@@ -2725,7 +2726,7 @@ namespace dpct
 
             /// Constructor of 1-D array with initializer list
             device_memory(const sycl::range<Dimension> &in_range,
-                std::initializer_list<value_t> &&init_list)
+                        std::initializer_list<value_t> &&init_list)
                 : device_memory(in_range) {
                 assert(init_list.size() <= in_range.size());
                 _host_ptr = (value_t *)std::malloc(_size);
@@ -2746,7 +2747,7 @@ namespace dpct
                 for (auto sub_list : init_list) {
                     assert(sub_list.size() <= in_range[1]);
                     std::memcpy(tmp_data, sub_list.begin(),
-                        sub_list.size() * sizeof(T));
+                                sub_list.size() * sizeof(T));
                     tmp_data += in_range[1];
                 }
             }
@@ -2789,7 +2790,7 @@ namespace dpct
                 allocate_device(q);
                 if (_host_ptr)
                     detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size,
-                        host_to_device);
+                                        host_to_device);
             }
 
             /// The variable is assigned to a device pointer.
@@ -2821,7 +2822,7 @@ namespace dpct
             /// when usm is used and dimension is greater than 1.
             template <size_t D = Dimension>
             typename std::enable_if<D != 1, dpct_accessor_t>::type
-                get_access([[maybe_unused]] sycl::handler &cgh) {
+            get_access([[maybe_unused]] sycl::handler &cgh) {
                 return dpct_accessor_t((T *)_device_ptr, _range);
             }
 
@@ -2831,21 +2832,21 @@ namespace dpct
                 _device_ptr(memory_ptr) {}
 
             void allocate_device(sycl::queue &q) {
-#ifndef DPCT_USM_LEVEL_NONE
+        #ifndef DPCT_USM_LEVEL_NONE
                 if (Memory == shared) {
                     _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
-                        q.get_context());
+                                                                q.get_context());
                     return;
                 }
-#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
+        #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
                 if (Memory == constant) {
                     _device_ptr = (value_t *)sycl::malloc_device(
                         _size, q.get_device(), q.get_context(),
                         sycl::ext::oneapi::property::usm::device_read_only());
                     return;
                 }
-#endif
-#endif
+        #endif
+        #endif
                 _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
             }
 
@@ -2869,7 +2870,7 @@ namespace dpct
             /// Default constructor
             device_memory() : base(1) {}
         };
-    } // namespace detail
+        } // namespace detail
 
     template <class T, size_t Dimension>
     using global_memory = detail::device_memory<T, global, Dimension>;
@@ -2880,54 +2881,54 @@ namespace dpct
 
 
     template <typename T,
-        sycl::access::address_space addressSpace =
-        sycl::access::address_space::global_space,
-        sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-        sycl::memory_scope memoryScope = sycl::memory_scope::device>
+            sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space,
+            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+            sycl::memory_scope memoryScope = sycl::memory_scope::device>
     inline T atomic_fetch_add(T *addr, T operand) {
-        auto atm =
-            sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-        return atm.fetch_add(operand);
+    auto atm =
+        sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+    return atm.fetch_add(operand);
     }
 
     template <sycl::access::address_space addressSpace =
-        sycl::access::address_space::global_space,
-        sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-        sycl::memory_scope memoryScope = sycl::memory_scope::device,
-        typename T1, typename T2>
+                sycl::access::address_space::global_space,
+            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+            sycl::memory_scope memoryScope = sycl::memory_scope::device,
+            typename T1, typename T2>
     inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
-        auto atm =
-            sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-        return atm.fetch_add(operand);
+    auto atm =
+        sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+    return atm.fetch_add(operand);
     }
 
     template <typename T, sycl::access::address_space addressSpace =
-        sycl::access::address_space::global_space>
+                            sycl::access::address_space::global_space>
     inline T atomic_fetch_add(T *addr, T operand,
-        sycl::memory_order memoryOrder) {
-        switch (memoryOrder) {
+                            sycl::memory_order memoryOrder) {
+    switch (memoryOrder) {
         case sycl::memory_order::relaxed:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
-                sycl::memory_scope::device>(addr, operand);
+                                    sycl::memory_scope::device>(addr, operand);
         case sycl::memory_order::acq_rel:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
-                sycl::memory_scope::device>(addr, operand);
+                                    sycl::memory_scope::device>(addr, operand);
         case sycl::memory_order::seq_cst:
             return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
-                sycl::memory_scope::device>(addr, operand);
+                                    sycl::memory_scope::device>(addr, operand);
         default:
             assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                "atomics are: sycl::memory_order::relaxed, "
-                "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+                            "atomics are: sycl::memory_order::relaxed, "
+                            "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
         }
     }
 
     template <sycl::access::address_space addressSpace =
-        sycl::access::address_space::global_space,
-        typename T1, typename T2>
+                sycl::access::address_space::global_space,
+            typename T1, typename T2>
     inline T1 atomic_fetch_add(T1 *addr, T2 operand,
-        sycl::memory_order memoryOrder) {
-        atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+                            sycl::memory_order memoryOrder) {
+    atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
     }
 
 } // COPY from DPCT head files

From b8ffaa646e1bf519c9a05203bf98ce92a163b87d Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 16:51:30 +0800
Subject: [PATCH 06/11] update ci cmd

---
 .devops/llama-cli-intel.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 6789e17afcc6e..5018c472b3c5a 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,7 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
         echo "LLAMA_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake -B build -DLLAMA_SYCL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

From 73bf3090d3f37fea8560d57dc76c2ac187412867 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 19 Jun 2024 17:15:05 +0800
Subject: [PATCH 07/11] modify true ci file

---
 .devops/llama-cli-intel.Dockerfile | 2 +-
 examples/sycl/win-build-sycl.bat   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 5018c472b3c5a..a76af20bd322d 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,7 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
         echo "LLAMA_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DLLAMA_SYCL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat
index b8037aae8c4ef..0fafc967f8f95 100644
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
 
 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 ::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main

From 61b628fa6ce29e3453dd195ac2dcce454d1b207c Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 20 Jun 2024 11:08:50 +0800
Subject: [PATCH 08/11] use cl as c compiler

---
 examples/sycl/win-build-sycl.bat | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat
index 0fafc967f8f95..027173b0a974b 100644
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -16,13 +16,13 @@ if %errorlevel% neq 0 goto ERROR
 ::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 ::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
 
 ::  build all binary
-make -j
+cmake --build . -j
 if %errorlevel% neq 0 goto ERROR
 
 cd ..

From 9b705f5836b96878bf3da5d80b8a19103c086f8b Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 20 Jun 2024 11:27:23 +0800
Subject: [PATCH 09/11] revert linux build cmd

---
 .devops/llama-cli-intel.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index a76af20bd322d..6789e17afcc6e 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,7 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
         echo "LLAMA_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx ${OPT_SYCL_F16} && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

From 7b8069e9ec8c60229cdc1febb489c4b0b5c65bcc Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 20 Jun 2024 16:14:48 +0800
Subject: [PATCH 10/11] update README

---
 README-sycl.md | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/README-sycl.md b/README-sycl.md
index bd1984706225f..42c3ec316341f 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -410,15 +410,9 @@ Output (example):
 
 4. Install build tools
 
-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
 
-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-
-- Extract `w64devkit` on your pc.
-
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
 
 ### II. Build llama.cpp
 
@@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
 
 # Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 cmake --build build --config Release -j
 ```
@@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
 .\examples\sycl\win-build-sycl.bat
 ```
 
+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. 
+
 *Notes:*
 
-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
 
 ### III. Run the inference
 

From 26a2a91ac681f1d57a2a121b79cf696c70c92745 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 20 Jun 2024 16:21:13 +0800
Subject: [PATCH 11/11] fix format

---
 README-sycl.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README-sycl.md b/README-sycl.md
index 42c3ec316341f..b7e2bb12a68e8 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -447,7 +447,7 @@ cmake --preset x64-windows-sycl-debug
 cmake --build build-x64-windows-sycl-debug -j --target llama-cli
 ```
 
-Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. 
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
 
 *Notes:*