This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Init arch Xe2 #298

Open · wants to merge 34 commits into base: xetla

Commits (34)
536e03e
save
sunjiweiswift May 17, 2024
5949084
save(some error with kslicing)
sunjiweiswift May 21, 2024
0669b12
fix kslicing bug
sunjiweiswift May 22, 2024
aafe774
save(g128 MTL 270Gflops bug on g32)
sunjiweiswift May 24, 2024
1b9a443
add Specialized for FPU
sunjiweiswift May 24, 2024
194ca35
support int scale col_major(with opt 10% perf when g = 32)
sunjiweiswift May 27, 2024
2bc4877
support int4x8 for int32 weight
sunjiweiswift May 27, 2024
8b9df8b
Update include/experimental/group/gemm/compute_policy.hpp
sunjiweiswift May 28, 2024
e8d3fbb
Update include/experimental/group/gemm/compute_policy.hpp
sunjiweiswift May 28, 2024
b0621df
save(perf bug with int4x8 load)
sunjiweiswift May 28, 2024
56be57a
save
sunjiweiswift May 29, 2024
2b37173
add first token UT
sunjiweiswift May 30, 2024
f973aa2
opt mma code
sunjiweiswift May 30, 2024
0f36c04
opt perf for int4x8
sunjiweiswift May 30, 2024
d9902d8
support load one fp16 data
sunjiweiswift May 31, 2024
30b8e95
support zero_pt
sunjiweiswift May 31, 2024
885995f
support ASYM and SYM
sunjiweiswift Jun 3, 2024
7e99e68
save
sunjiweiswift Jun 4, 2024
150f7d3
ut improve
sunjiweiswift Jun 6, 2024
ddbac97
support sg_n > 1
sunjiweiswift Jun 6, 2024
d2aff4b
add #pragma unroll
sunjiweiswift Jun 7, 2024
97c2481
support HF zero pt layout K x N, compress int4 along N dimensions
sunjiweiswift Jun 7, 2024
f19c86f
save
sunjiweiswift Jun 11, 2024
897f5d5
sg_m =4 for first token
sunjiweiswift Jun 14, 2024
e7f2716
Extract dequant func
sunjiweiswift Jun 14, 2024
0ebd890
update row_major for origin PVC/ARC template
sunjiweiswift Jun 17, 2024
b2dfad5
save(fix HPC 2D load)
sunjiweiswift Jun 17, 2024
8817f54
fix XEHPC 2D load
sunjiweiswift Jun 17, 2024
957c5a4
fix compile for all UT
sunjiweiswift Jun 17, 2024
5456fc0
sync ipex 20240618
DDEle Jun 19, 2024
9185409
opt PVC arch
sunjiweiswift Jun 19, 2024
93c8ad1
fix group_qkv
sunjiweiswift Jun 19, 2024
8f0abc4
fix group_qkv
sunjiweiswift Jun 20, 2024
dc7d812
init arch Xe2
airMeng Jun 20, 2024
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -66,7 +66,7 @@ else()
set(XETLA_KERNEL_FLAGS ${XETLA_KERNEL_FLAGS} -Xs "${XETLA_OFFLINE_OPTIONS}")
endif()

add_compile_options(-fsycl -fsycl-device-code-split=per_kernel)
add_compile_options(-fsycl -fsycl-device-code-split=per_kernel -ftemplate-backtrace-limit=0)
add_compile_options(-Wall -Wextra -Werror)

include(ProcessorCount)
28 changes: 14 additions & 14 deletions examples/05_batch_gemm/batch_gemm.hpp
@@ -276,20 +276,20 @@ class batch_gemm_t {
args.matB_base.base, args.matB_ld);
}
}
if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
if (epilogue_t::msg_type_c == msg_type::block_2d) {
implementable &=
kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
(uint64_t)(args.matC_base.base),
args.matrix_n,
args.matrix_m * args.batch_size,
args.matC_ld);
} else {
implementable &=
kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
args.matC_base.base, args.matC_ld);
}
}
// if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
// if (epilogue_t::msg_type_c == msg_type::block_2d) {
// implementable &=
// kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
// (uint64_t)(args.matC_base.base),
// args.matrix_n,
// args.matrix_m * args.batch_size,
// args.matC_ld);
// } else {
// implementable &=
// kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
// args.matC_base.base, args.matC_ld);
// }
// }

return implementable;
}
56 changes: 28 additions & 28 deletions examples/07_multi_layer_perceptron/multi_layer_perceptron.hpp
@@ -409,20 +409,20 @@ class multi_layer_perceptron_t {
args.matW_base.base, args.matW_ld);
}
}
if (epilogue_layer1_t::msg_type_c != msg_type::unaligned_2d) {
if (epilogue_layer1_t::msg_type_c == msg_type::block_2d) {
implementable &=
kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
(uint64_t)(args.matB_base.base),
args.matrix_n_layer1,
args.matrix_m_layer1,
args.matB_ld);
} else {
implementable &=
kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
args.matB_base.base, args.matB_ld);
}
}
// if (epilogue_layer1_t::msg_type_c != msg_type::unaligned_2d) {
// if (epilogue_layer1_t::msg_type_c == msg_type::block_2d) {
// implementable &=
// kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
// (uint64_t)(args.matB_base.base),
// args.matrix_n_layer1,
// args.matrix_m_layer1,
// args.matB_ld);
// } else {
// implementable &=
// kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
// args.matB_base.base, args.matB_ld);
// }
// }
if (gemm_layer2_t::msg_type_a != msg_type::unaligned_2d) {
if (gemm_layer2_t::msg_type_a == msg_type::block_2d) {
implementable &=
@@ -451,20 +451,20 @@ class multi_layer_perceptron_t {
args.matV_base.base, args.matV_ld);
}
}
if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
implementable &=
kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
(uint64_t)(args.matC_base.base),
args.matrix_n_layer2,
args.matrix_m_layer2,
args.matC_ld);
} else {
implementable &=
kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
args.matC_base.base, args.matC_ld);
}
}
// if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
// if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
// implementable &=
// kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
// (uint64_t)(args.matC_base.base),
// args.matrix_n_layer2,
// args.matrix_m_layer2,
// args.matC_ld);
// } else {
// implementable &=
// kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
// args.matC_base.base, args.matC_ld);
// }
// }

return implementable;
}
11 changes: 7 additions & 4 deletions examples/08_scaled_dot_product_attention/softmax.hpp
@@ -60,18 +60,21 @@ struct xetla_softmax_fwd_t {
using softmax_tile_desc_t = subgroup::
tile_desc_t<SIMD, block_height, SIMD, block_height, reg_layout::tiled>;
using softmax_load_t = subgroup::tile_t<dtype_in, softmax_tile_desc_t>;
using mem_desc_in_t = mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>;
using softmax_load_payload_t = subgroup::mem_payload_t<
mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>,
mem_desc_in_t,
softmax_tile_desc_t,
subgroup::msg_type_v<softmax_tile_desc_t, mem_space_in>,
subgroup::msg_type_v<softmax_tile_desc_t, mem_desc_in_t>,
arch_tag>;

// this tile will store the softmax result to global memory
using softmax_store_t = subgroup::tile_t<dtype_out, softmax_tile_desc_t>;
using mem_desc_out_t =
mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>;
using softmax_store_payload_t = subgroup::mem_payload_t<
mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>,
mem_desc_out_t,
softmax_tile_desc_t,
subgroup::msg_type_v<softmax_tile_desc_t, mem_space_out>,
subgroup::msg_type_v<softmax_tile_desc_t, mem_desc_out_t>,
arch_tag>;

struct arguments_t {
2 changes: 1 addition & 1 deletion examples/09_gate_recurrent_unit/kernel_func.hpp
@@ -156,7 +156,7 @@ struct gru_layer {
using mat_hidden_payload_t = mem_payload_t<
mem_desc_a_t,
matC_tile_desc_t,
msg_type_v<matC_tile_desc_t, mem_loc_input>,
msg_type_v<matC_tile_desc_t, mem_desc_a_t>,
gpu_arch::XeHpc>;
using matC_payload_t = mem_payload_t<
mem_desc_c_t,
63 changes: 52 additions & 11 deletions include/common/core/arch_config.hpp
@@ -31,9 +31,8 @@ struct load_store_attr_t {
static constexpr bool has_hw_block_2d = false;
};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {
/// HW limitation checks https://gfxspecs.intel.com/Predator/Home/Index/55490
Comment on lines -34 to -36 (Contributor):

There are multiple places with the gfxspecs link. I think they are helpful for internal developers. If they violate any company policies, they should be removed all at once in a separate PR.

template <msg_type message_type, gpu_arch arg_tag>
struct xe_plus_load_store_attr_base_t {
static constexpr bool has_hw_block_2d = true;
static constexpr uint32_t max_load_height_in_elem = 32;
static constexpr uint32_t max_load_width_in_bytes = 64;
@@ -55,10 +54,9 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {

template <msg_type message_type, gpu_arch arg_tag>
struct client_load_store_attr_base_t {
/// HW limitation checks https://gfxspecs.intel.com/Predator/Home/Index/55490
static constexpr bool has_hw_block_2d = false;
static constexpr uint32_t max_load_height_in_elem = 32;
static constexpr uint32_t max_load_width_in_bytes = 64;
static constexpr uint32_t max_load_height_in_elem = 0;
static constexpr uint32_t max_load_width_in_bytes = 0;
static constexpr uint32_t max_trans_load_width_in_bytes = 32;
static constexpr uint32_t max_vnni_load_width_in_elems = 16;
static constexpr uint32_t min_vnni_load_height_in_bytes = 4;
@@ -87,21 +85,40 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeLpg>
msg_type::block_2d,
gpu_arch::XeLpg> {};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc>
: public xe_plus_load_store_attr_base_t<
msg_type::block_2d,
gpu_arch::XeHpc> {};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::Xe2>
: public xe_plus_load_store_attr_base_t<
msg_type::block_2d,
gpu_arch::Xe2> {};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_2d_load_store =
load_store_attr_t<msg_type::block_2d, arch_tag>::has_hw_block_2d;

template <gpu_arch arch_tag>
struct load_store_attr_t<msg_type::block_1d, arch_tag> {
static constexpr uint32_t max_load_vec_len = 32;
static constexpr uint32_t max_store_vec_len = 32;
static constexpr uint32_t max_load_vec_len = 256;
static constexpr uint32_t max_store_vec_len = 256;
static constexpr uint32_t max_prefetch_vec_len = 32;
};

template <>
struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
static constexpr uint32_t max_load_vec_len = 64;
static constexpr uint32_t max_store_vec_len = 64;
static constexpr uint32_t max_load_vec_len = 512;
static constexpr uint32_t max_store_vec_len = 512;
static constexpr uint32_t max_prefetch_vec_len = 64;
};

template <>
struct load_store_attr_t<msg_type::block_1d, gpu_arch::Xe2> {
static constexpr uint32_t max_load_vec_len = 512;
static constexpr uint32_t max_store_vec_len = 512;
static constexpr uint32_t max_prefetch_vec_len = 64;
};

@@ -129,6 +146,11 @@ struct dpas_attr_t<gpu_arch::XeHpg> : public dpas_attr_base_t {
static constexpr uint32_t n_fixed_limit = 8;
};

template <>
struct dpas_attr_t<gpu_arch::Xe2> : public dpas_attr_t<gpu_arch::XeHpc> {
static constexpr uint32_t systolic_depth = 4;
};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_xmx = dpas_attr_t<arch_tag>::has_xmx;

@@ -162,6 +184,10 @@ template <>
struct register_bytes_t<gpu_arch::XeLpg> {
static constexpr uint32_t reg_in_bytes = 32;
};
template <>
struct register_bytes_t<gpu_arch::Xe2> {
static constexpr uint32_t reg_in_bytes = 64;
};

template <grf_mode grf_num_mode, gpu_arch arch_tag>
struct register_attr_t {
@@ -236,10 +262,25 @@ struct arch_attr_t<gpu_arch::XeLpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeLpg>;

static constexpr uint32_t max_wg_num = 64;
static constexpr uint32_t max_wg_num = 16;
static constexpr uint32_t local_mem_size = 64 * 1024;
};

template <>
struct arch_attr_t<gpu_arch::Xe2> {
template <msg_type message_type = msg_type::block_2d>
using load_store_attr = load_store_attr_t<message_type, gpu_arch::Xe2>;

template <grf_mode grf_num_mode = grf_mode::double_grf>
using register_attr = register_attr_t<grf_num_mode, gpu_arch::Xe2>;

using dpas_attr = dpas_attr_t<gpu_arch::Xe2>;

static constexpr uint32_t max_wg_num = 16;
static constexpr uint32_t local_mem_size = 128 * 1024;
};


/// @} xetla_core_arch_config

} // namespace gpu::xetla
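
A minimal compile-time sketch of how the Xe2 attributes added above could be queried. The include path is assumed; every name and value comes from this hunk, and the snippet is illustrative rather than part of the PR's diff.

// Sketch only: static checks against the Xe2 attributes introduced above.
#include <common/core/arch_config.hpp> // assumed include path

namespace gpu::xetla {
// Xe2 keeps HW 2D block load/store support via the Xe-plus base attributes.
static_assert(arch_has_2d_load_store<gpu_arch::Xe2>);
// Per-architecture limits added for Xe2 in this change.
static_assert(arch_attr_t<gpu_arch::Xe2>::max_wg_num == 16);
static_assert(arch_attr_t<gpu_arch::Xe2>::local_mem_size == 128 * 1024);
static_assert(register_bytes_t<gpu_arch::Xe2>::reg_in_bytes == 64);
static_assert(dpas_attr_t<gpu_arch::Xe2>::systolic_depth == 4);
static_assert(
    load_store_attr_t<msg_type::block_1d, gpu_arch::Xe2>::max_load_vec_len ==
    512);
} // namespace gpu::xetla
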
5 changes: 2 additions & 3 deletions include/common/core/base_consts.hpp
@@ -23,9 +23,8 @@

namespace gpu::xetla {

/// @addtogroup xetla_core_base_types
/// @addtogroup xetla_core_base_consts
/// @{

/// @} xetla_core_base_types
/// @} xetla_core_base_consts

} // namespace gpu::xetla
40 changes: 40 additions & 0 deletions include/common/core/base_types.hpp
@@ -55,6 +55,32 @@ using fp16 = sycl::half;
///
using tf32 = sycl::ext::intel::experimental::esimd::tfloat32;

/// @brief xetla 4-bit data packed as an 8-bit data type.
/// Two 4-bit values are packed into one byte.
struct int4x2 {
uint8_t data;

operator uint8_t() const {
return data;
}
int4x2(uint8_t val) {
data = val;
}
};

/// @brief xetla 4-bit data packed as a 32-bit data type.
/// Eight 4-bit values are packed into four bytes.
struct int4x8 {
uint32_t data;

operator uint32_t() const {
return data;
}
int4x8(uint32_t val) {
data = val;
}
};

/// @brief mx_fp4(E2M1) data packed as 8bits data type.
struct mx_fp4 {
uint8_t data;
@@ -89,6 +115,8 @@ template <typename T>
struct is_internal_type {
static constexpr bool value = std::is_same<remove_const_t<T>, bf16>::value ||
std::is_same<remove_const_t<T>, tf32>::value ||
std::is_same<remove_const_t<T>, int4x2>::value ||
std::is_same<remove_const_t<T>, int4x8>::value ||
std::is_same<remove_const_t<T>, mx_fp4>::value;
};
template <typename T>
@@ -137,6 +165,18 @@ struct native_type<mx_fp4> {
using type = uint8_t;
};

/// @brief Set uint8_t as the native data type of int4x2.
template <>
struct native_type<int4x2> {
using type = uint8_t;
};

/// @brief Set uint32_t as the native data type of int4x8.
template <>
struct native_type<int4x8> {
using type = uint32_t;
};

/// @brief Return the native data type of T
template <typename T>
using native_type_t = typename native_type<T>::type;
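
A minimal usage sketch for the packed-int4 carrier types defined above. The helpers pack_int4x2, unpack_lo and unpack_hi, the low-nibble-first ordering, and the include path are assumptions for illustration; only int4x2, int4x8 and native_type_t come from this file.

// Sketch only: packing two 4-bit values into an int4x2 and reading them back.
#include <common/core/base_types.hpp> // assumed include path
#include <cstdint>
#include <type_traits>

using gpu::xetla::int4x2;
using gpu::xetla::int4x8;
using gpu::xetla::native_type_t;

// Hypothetical helper: pack two 4-bit values into one byte, low nibble first.
inline int4x2 pack_int4x2(uint8_t lo, uint8_t hi) {
  return int4x2(static_cast<uint8_t>((lo & 0xF) | ((hi & 0xF) << 4)));
}
// Hypothetical helpers: read the nibbles back via the implicit uint8_t conversion.
inline uint8_t unpack_lo(int4x2 v) {
  return static_cast<uint8_t>(v) & 0xF;
}
inline uint8_t unpack_hi(int4x2 v) {
  return (static_cast<uint8_t>(v) >> 4) & 0xF;
}

// Storage types match the native_type specializations above.
static_assert(std::is_same_v<native_type_t<int4x2>, uint8_t>);
static_assert(std::is_same_v<native_type_t<int4x8>, uint32_t>);
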
11 changes: 10 additions & 1 deletion include/common/core/common_types.hpp
@@ -21,9 +21,18 @@
#include <cstdint>

namespace gpu::xetla {
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2, Xe2 = 3 };

enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };

enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };

enum class quant_mode : uint8_t { S4_ASYM = 0, S4_FULLRANGE_NO_ZP = 1 };

struct quant_info {
quant_mode quant_mode;
uint32_t dequant_s;
mem_layout weight_mem_layout;
};

} // namespace gpu::xetla
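
A minimal sketch of filling the quant_info struct added above. The include path and the concrete values (group size 32, column-major weights) are illustrative assumptions; the type, enum and field names come from this hunk.

// Sketch only: describing an asymmetric int4 weight with per-group dequant scales.
#include <common/core/common_types.hpp> // assumed include path

int main() {
  using namespace gpu::xetla;
  quant_info qinfo{};
  qinfo.quant_mode = quant_mode::S4_ASYM; // int4 with zero point
  qinfo.dequant_s = 32; // dequantization group size (illustrative)
  qinfo.weight_mem_layout = mem_layout::col_major; // weight stored column-major
  // Symmetric / full-range weights would use quant_mode::S4_FULLRANGE_NO_ZP instead.
  return 0;
}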