diff --git a/include/cute/arch/copy_xe.hpp b/include/cute/arch/copy_xe.hpp index 7c34d49461..62f6ee9211 100644 --- a/include/cute/arch/copy_xe.hpp +++ b/include/cute/arch/copy_xe.hpp @@ -475,4 +475,26 @@ struct XE_2D_U32x8x16x1x1_ST_N } }; +/// @brief This function atomic store into global memory. +template +struct XE_ATOMIC { + using SRegisters = S[1]; + using DRegisters = D[1]; + + CUTE_STATIC_ASSERT(is_same_v || is_same_v || is_same_v); + + template + CUTE_HOST_DEVICE static void + copy(S_ const& src, D_ & dst) { + #if defined(SYCL_INTEL_TARGET) + auto v = sycl::atomic_ref(*&dst); + v += static_cast(*&src); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); + #endif + } +}; + } // end namespace diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 261654506a..fa80896dd4 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -469,4 +469,18 @@ auto make_xe_2d_copy(Tensor gtensor) { Traits traits {gtensor}; return Copy_Atom {traits}; } + +template +struct Copy_Traits> { + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; } // end namespace cute