diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index 4a6199748..8323abb9a 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -37,13 +37,21 @@ #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x #else -#define SYCL_DEVICE_BUILTIN(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x #else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif using namespace cute; diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp index 6bd10d309..ac8d7d3ed 100644 --- a/include/cute/arch/xe_copy_2B.hpp +++ b/include/cute/arch/xe_copy_2B.hpp @@ -38,13 +38,21 @@ #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x #else -#define SYCL_DEVICE_BUILTIN(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x #else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif using namespace cute; diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 49df65680..78cd1471c 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -37,13 +37,21 @@ #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x #else -#define SYCL_DEVICE_BUILTIN(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x #else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif enum class CacheControl { diff --git a/include/cute/arch/xe_copy_8B.hpp b/include/cute/arch/xe_copy_8B.hpp index c15c630fe..f340fb5ac 100644 --- a/include/cute/arch/xe_copy_8B.hpp +++ b/include/cute/arch/xe_copy_8B.hpp @@ -37,13 +37,21 @@ #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x #else -#define SYCL_DEVICE_BUILTIN(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x #else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Attempting to use a device built-in in host code.");} +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } #endif using namespace cute; diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index 308a924ea..6ec440ea2 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -109,7 +109,8 @@ struct global_load { "l"(((uint8_t *)ptr) + 48), "r"(data[3].x), "r"(data[3].y), "r"(data[3].z), "r"(data[3].w)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -441,7 +452,8 @@ struct global_store { "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16), "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -461,7 +473,8 @@ struct global_store { : : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -481,7 +494,8 @@ struct global_store { : : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -501,7 +515,8 @@ struct global_store { : : "l"(ptr), "r"(data), "r"((int)pred_guard)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -521,7 +536,8 @@ struct global_store { : : "l"(ptr), "h"(data), "r"((int)pred_guard)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } }; @@ -551,7 +567,8 @@ void shared_load<2>(void *dst, uint32_t ptr) { : "=h"(*reinterpret_cast(dst)) : "r"(ptr)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -564,23 +581,8 @@ void shared_load<4>(void *dst, uint32_t ptr) { : "=r"(*reinterpret_cast(dst)) : "r"(ptr)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); -#endif -} - -/// ld.shared - 64b -template <> -CUTLASS_DEVICE -void shared_load<8>(void *dst, uint32_t ptr) { - uint2 *dst_u64 = reinterpret_cast(dst); -#if defined(__CUDA_ARCH__) || defined(__SYCL_CUDA_ARCH__) - asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n" - : - "=r"(dst_u64->x), - "=r"(dst_u64->y) - : "r"(ptr)); -#else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -598,7 +600,8 @@ void shared_load<16>(void *dst, uint32_t ptr) { "=r"(dst_u128->w) : "r"(ptr)); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -621,7 +624,8 @@ void shared_store<2>(uint32_t ptr, void const *src) { "h"(*reinterpret_cast(src)) ); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -636,7 +640,8 @@ void shared_store<4>(uint32_t ptr, void const *src) { "r"(*reinterpret_cast(src)) ); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -653,7 +658,8 @@ void shared_store<8>(uint32_t ptr, void const *src) { "r"(dst_u64->y) ); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif } @@ -672,7 +678,8 @@ void shared_store<16>(uint32_t ptr, void const *src) { "r"(dst_u128->w) ); #else - CUTLASS_INVALID_CONTROL_PATH("Attempting to use Nvidia-specific code path on non-Nvidia hardware."); + CUTLASS_INVALID_CONTROL_PATH( + "Attempting to use Nvidia-specific code path on non-Nvidia hardware."); #endif }