From b915fa19fa9e8084439045ad968d29af7ff3cb75 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 12 Jul 2023 16:28:48 +0000 Subject: [PATCH 1/7] use advanced memory ordering instructions in CUDA --- .../{volatile.hpp.inc => memory.hpp.inc} | 0 common/cuda_hip/components/syncfree.hpp.inc | 43 +- cuda/components/memory.cuh | 789 ++++++++++++++++++ cuda/components/syncfree.cuh | 2 +- cuda/components/volatile.cuh | 58 -- cuda/solver/common_trs_kernels.cuh | 33 +- dev_tools/scripts/generate_cuda_memory_ptx.py | 192 +++++ .../{volatile.hip.hpp => memory.hip.hpp} | 70 +- hip/components/syncfree.hip.hpp | 2 +- 9 files changed, 1079 insertions(+), 110 deletions(-) rename common/cuda_hip/components/{volatile.hpp.inc => memory.hpp.inc} (100%) create mode 100644 cuda/components/memory.cuh delete mode 100644 cuda/components/volatile.cuh create mode 100755 dev_tools/scripts/generate_cuda_memory_ptx.py rename hip/components/{volatile.hip.hpp => memory.hip.hpp} (55%) diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/memory.hpp.inc similarity index 100% rename from common/cuda_hip/components/volatile.hpp.inc rename to common/cuda_hip/components/memory.hpp.inc diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc index 6b6dcc70f24..113c66d91ec 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp.inc @@ -93,48 +93,31 @@ public: const auto dep_block = dependency / (block_size / subwarp_size); const auto dep_local = dependency % (block_size / subwarp_size); // assert(dependency < work_id); - if (dep_block == block_id) { - // wait for a local dependency - while (!load(local.status, dep_local)) { - __threadfence(); - } - } else { - // wait for a global dependency - while (!load(global.status, dependency)) { - __threadfence(); + if (get_lane() == 0) { + if (dep_block == block_id) { + // wait for a local dependency + while (!load_acquire_shared(local.status + dep_local)) { + } + } else { + // wait for a global dependency + while (!load_acquire(global.status + dependency)) { + } } } - __threadfence(); + group::tiled_partition(group::this_thread_block()).sync(); } - __device__ __forceinline__ bool peek(IndexType dependency) - { - const auto dep_block = dependency / (block_size / subwarp_size); - const auto dep_local = dependency % (block_size / subwarp_size); - // assert(dependency < work_id); - if (dep_block == block_id) { - // peek at a local dependency - auto finished = load(local.status, dep_local) != 0; - __threadfence(); - return finished; - } else { - // peek at a global dependency - auto finished = load(global.status, dependency); - __threadfence(); - return finished; - } - } + __device__ __forceinline__ bool peek(IndexType dependency) { return false; } __device__ __forceinline__ void mark_ready() { group::tiled_partition(group::this_thread_block()).sync(); - __threadfence(); if (get_lane() == 0) { const auto sh_id = get_work_id() % (block_size / subwarp_size); // notify local warps - store(local.status, sh_id, 1); + store_release_shared(local.status + sh_id, 1); // notify other blocks - store(global.status, get_work_id(), 1); + store_release(global.status + get_work_id(), 1); } } diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh new file mode 100644 index 00000000000..578f7c8309f --- /dev/null +++ b/cuda/components/memory.cuh @@ -0,0 +1,789 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors 
+All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_ +#define GKO_CUDA_COMPONENTS_MEMORY_CUH_ + + +#include + + +#include + + +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) +{ +// see +// https://github.com/NVIDIA/cutlass/blob/ +// 6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/ +// include/cutlass/arch/memory_sm75.h#L90 +// for reasoning behind this implementation +#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11) + return static_cast(__cvta_generic_to_shared(ptr)); +#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \ + __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); +#else + uint32 smem_ptr; + asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 " + "%0, smem_ptr; }}" + : "=r"(smem_ptr) + : "l"(ptr)); + return smem_ptr; +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.gl;" ::: "memory"); +#else + asm volatile("fence.acq_rel.gpu;" ::: "memory"); +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel_shared() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.cta;" ::: "memory"); +#else + asm volatile("fence.acq_rel.cta;" ::: "memory"); +#endif +} + + +#include "common/cuda_hip/components/memory.hpp.inc" + + +__device__ __forceinline__ int32 load_relaxed_shared(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); 
+#else + asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_relaxed_shared(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_relaxed_shared(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_relaxed_shared(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_acquire_shared(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_acquire_shared(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b64 %0, 
[%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_acquire_shared(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(float* ptr, float result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_acquire_shared(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(double* ptr, double result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_relaxed(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_relaxed(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#else + asm 
volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_relaxed(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(float* ptr, float result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_relaxed(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(double* ptr, double result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_acquire(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(int32* ptr, int32 result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#else + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_acquire(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(int64* ptr, int64 result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#else + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_acquire(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(float* ptr, float result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#else + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_acquire(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) 
+ : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(double* ptr, double result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#else + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed_shared( + thrust::complex* ptr) +{ + float real_result; + float imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex* ptr, thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed_shared( + thrust::complex* ptr) +{ + double real_result; + double imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex* ptr, thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(real_result), "d"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(real_result), "d"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed( + thrust::complex* ptr) +{ + float real_result; + float imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "l"(ptr) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex* ptr, + thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ 
thrust::complex load_relaxed( + thrust::complex* ptr) +{ + double real_result; + double imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "l"(ptr) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex* ptr, + thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); +#endif +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh index 625f1bd8359..d00064b06b7 100644 --- a/cuda/components/syncfree.cuh +++ b/cuda/components/syncfree.cuh @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/volatile.cuh" +#include "cuda/components/memory.cuh" namespace gko { diff --git a/cuda/components/volatile.cuh b/cuda/components/volatile.cuh deleted file mode 100644 index 96cb869c57e..00000000000 --- a/cuda/components/volatile.cuh +++ /dev/null @@ -1,58 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ - -#ifndef GKO_CUDA_COMPONENTS_VOLATILE_CUH_ -#define GKO_CUDA_COMPONENTS_VOLATILE_CUH_ - - -#include - - -#include - - -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/volatile.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - -#endif // GKO_CUDA_COMPONENTS_VOLATILE_CUH_ diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 6ee2c7521ff..546b366c6a2 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -55,9 +55,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" +#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/volatile.cuh" namespace gko { @@ -426,30 +426,31 @@ __global__ void sptrsv_naive_caching_kernel( : dependency * nrhs + rhs; const bool shmem_possible = (dependency_gid / default_block_size) == self_shmem_id; + ValueType val{}; if (shmem_possible) { const auto dependency_shid = dependency_gid % default_block_size; - x_p = &x_s[dependency_shid]; - } - - ValueType x = *x_p; - while (is_nan(x)) { - x = load(x_p, 0); + while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) { + } + } else { + while ( + is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) { + } } - sum += x * vals[i]; + sum += val * vals[i]; } // The first entry past the triangular part will be the diagonal const auto diag = unit_diag ? one() : vals[i]; const auto r = (b[row * b_stride + rhs] - sum) / diag; - store(x_s, self_shid, r); - x[row * x_stride + rhs] = r; + store_relaxed_shared(x_s + self_shid, r); + store_relaxed(x + row * x_stride + rhs, r); // This check to ensure no infinite loops happen. if (is_nan(r)) { - store(x_s, self_shid, zero()); - x[row * x_stride + rhs] = zero(); + store_relaxed(x_s + self_shid, zero()); + store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } } @@ -488,12 +489,12 @@ __global__ void sptrsv_naive_legacy_kernel( auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { - auto x_val = load(x, col * x_stride + rhs); + auto x_val = load_relaxed(x + col * x_stride + rhs); while (!is_nan(x_val)) { sum += vals[j] * x_val; j += row_step; col = colidxs[j]; - x_val = load(x, col * x_stride + rhs); + x_val = load_relaxed(x + col * x_stride + rhs); } // to avoid the kernel hanging on matrices without diagonal, // we bail out if we are past the triangle, even if it's not @@ -503,12 +504,12 @@ __global__ void sptrsv_naive_legacy_kernel( // assert(row == col); auto diag = unit_diag ? 
one() : vals[j]; const auto r = (b[row * b_stride + rhs] - sum) / diag; - store(x, row * x_stride + rhs, r); + store_relaxed(x + row * x_stride + rhs, r); // after we encountered the diagonal, we are done // this also skips entries outside the triangle j = row_end; if (is_nan(r)) { - store(x, row * x_stride + rhs, zero()); + store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py new file mode 100755 index 00000000000..a03cb47f4e7 --- /dev/null +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +import os +memory_spaces = [(".shared", ".cta", "_shared", "convert_generic_ptr_to_smem_ptr(ptr)", "r"), ("", ".gpu", "", "ptr", "l")] +memory_orderings = [ + (".relaxed", "_relaxed", ".relaxed", "_relaxed", True), + (".acquire", "_acquire", ".release", "_release", False) + ] +sizes=[(".b32", "r", "int32", 4), (".b64", "l", "int64", 8), (".f32", "f", "float", 4), (".f64", "d", "double", 8)] +# header +print("""/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_ +#define GKO_CUDA_COMPONENTS_MEMORY_CUH_ + + +#include + + +#include + + +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) +{ +// see +// https://github.com/NVIDIA/cutlass/blob/ +// 6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/ +// include/cutlass/arch/memory_sm75.h#L90 +// for reasoning behind this implementation +#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11) + return static_cast(__cvta_generic_to_shared(ptr)); +#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \ + __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); +#else + uint32 smem_ptr; + asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 " + "%0, smem_ptr; }}" + : "=r"(smem_ptr) + : "l"(ptr)); + return smem_ptr; +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.gl;" ::: "memory"); +#else + asm volatile("fence.acq_rel.gpu;" ::: "memory"); +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel_shared() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.cta;" ::: "memory"); +#else + asm volatile("fence.acq_rel.cta;" ::: "memory"); +#endif +} + + +#include "common/cuda_hip/components/memory.hpp.inc" +""") + +# relaxed +for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: + for volta_load_ordering_suffix, load_function_ordering_suffix, volta_store_ordering_suffix, store_function_ordering_suffix, is_relaxed in memory_orderings: + for size_suffix, constraint, typename, size in sizes: + membar_expression = "" if is_relaxed else f"membar_acq_rel{function_memory_space_suffix}();" + print(f""" +__device__ __forceinline__ {typename} load{load_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr) +{{ + {typename} result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile{memory_space_suffix}{size_suffix} %0, [%1];" + : "={constraint}"(result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#else + asm volatile("ld{volta_load_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} %0, [%1];" + : "={constraint}"(result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#endif + {membar_expression} + return result; +}} + + +__device__ __forceinline__ void store{store_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr, {typename} result) +{{ + {membar_expression} +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile{memory_space_suffix}{size_suffix} [%0], %1;" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + : "memory"); +#else + asm volatile("st{volta_store_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} [%0], %1;" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + : "memory"); +#endif +}} +""") + +# vectorized relaxed loads for thrust::complex +sizes=[(".f32", "f", "float", 4), (".f64", "d", "double", 8)] +for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: + for size_suffix, constraint, typename, size in sizes: + print(f""" +__device__ __forceinline__ thrust::complex<{typename}> load_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr) +{{ + {typename} real_result; + {typename} imag_result; +#if __CUDA_ARCH__ < 700 + asm 
volatile("ld.volatile{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" + : "={constraint}"(real_result), "={constraint}"(imag_result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#else + asm volatile("ld.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" + : "={constraint}"(real_result), "={constraint}"(imag_result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#endif + return thrust::complex<{typename}>{{real_result, imag_result}}; +}} + + +__device__ __forceinline__ void store_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr, thrust::complex<{typename}> result) +{{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + : "memory"); +#endif +}} +""") + +print(""" +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ +""") \ No newline at end of file diff --git a/hip/components/volatile.hip.hpp b/hip/components/memory.hip.hpp similarity index 55% rename from hip/components/volatile.hip.hpp rename to hip/components/memory.hip.hpp index de0202d8391..b424c8bbc06 100644 --- a/hip/components/volatile.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ -#define GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ +#ifndef GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ +#define GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ #include @@ -48,11 +48,73 @@ namespace kernels { namespace hip { -#include "common/cuda_hip/components/volatile.hpp.inc" +#include "common/cuda_hip/components/memory.hpp.inc" + + +template +__device__ __forceinline__ ValueType load_relaxed(ValueType* ptr) +{ + return load(ptr, 0); +} + + +template +__device__ __forceinline__ ValueType load_acquire(ValueType* ptr) +{ + auto result = load(ptr, 0); + __threadfence(); + return result; +} + +template +__device__ __forceinline__ void store_relaxed(ValueType* ptr, ValueType value) +{ + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ void store_release(ValueType* ptr, ValueType value) +{ + __threadfence(); + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ ValueType load_relaxed_shared(ValueType* ptr) +{ + return load(ptr, 0); +} + + +template +__device__ __forceinline__ ValueType load_acquire_shared(ValueType* ptr) +{ + auto result = load(ptr, 0); + __threadfence(); + return result; +} + +template +__device__ __forceinline__ void store_relaxed_shared(ValueType* ptr, + ValueType value) +{ + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ void store_release_shared(ValueType* ptr, + ValueType value) +{ + __threadfence(); + store(ptr, 0, value); +} } // namespace hip } // namespace kernels } // namespace gko -#endif // GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ +#endif // GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp index 232ff059585..528a9200d08 100644 --- 
a/hip/components/syncfree.hip.hpp +++ b/hip/components/syncfree.hip.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" #include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/volatile.hip.hpp" +#include "hip/components/memory.hip.hpp" namespace gko { From b6e4d4d1f1494a74700f5f74b2a67f4c72ef50db Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Sep 2023 22:36:29 +0200 Subject: [PATCH 2/7] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - const-correctness - add doc to generic-to-shared ptr conversion - improve generation script readability Co-authored-by: Marcel Koch Co-authored-by: Thomas Grützmacher --- cuda/components/memory.cuh | 207 +++++++++--------- dev_tools/scripts/generate_cuda_memory_ptx.py | 127 +++++++---- hip/components/memory.hip.hpp | 8 +- 3 files changed, 196 insertions(+), 146 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 578f7c8309f..15f2541bddf 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -48,6 +48,13 @@ namespace kernels { namespace cuda { +/** + * Transforms a generic CUDA pointer pointing to shared memory to a + * shared memory pointer for use in PTX assembly. + * CUDA PTX assembly uses 32bit pointers for shared memory addressing. + * The result is undefined for a generic pointer pointing to anything but + * shared memory. + */ __device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) { // see @@ -94,18 +101,18 @@ __device__ __forceinline__ uint32 membar_acq_rel_shared() #include "common/cuda_hip/components/memory.hpp.inc" -__device__ __forceinline__ int32 load_relaxed_shared(int32* ptr) +__device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -117,30 +124,30 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_relaxed_shared(int64* ptr) +__device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -152,30 +159,30 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - 
convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_relaxed_shared(float* ptr) +__device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -187,30 +194,30 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_relaxed_shared(double* ptr) +__device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -222,30 +229,30 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_acquire_shared(int32* ptr) +__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -258,30 +265,30 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_acquire_shared(int64* ptr) +__device__ __forceinline__ int64 
load_acquire_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -294,30 +301,30 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_acquire_shared(float* ptr) +__device__ __forceinline__ float load_acquire_shared(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -330,30 +337,30 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_acquire_shared(double* ptr) +__device__ __forceinline__ double load_acquire_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -366,30 +373,30 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_relaxed(int32* ptr) +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -400,27 +407,27 @@ __device__ 
__forceinline__ int32 load_relaxed(int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_relaxed(int64* ptr) +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -431,27 +438,27 @@ __device__ __forceinline__ int64 load_relaxed(int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_relaxed(float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -462,27 +469,27 @@ __device__ __forceinline__ float load_relaxed(float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_relaxed(double* ptr) +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -493,27 +500,27 @@ __device__ __forceinline__ double load_relaxed(double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_acquire(int32* ptr) +__device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif 
membar_acq_rel(); @@ -525,27 +532,27 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_acquire(int64* ptr) +__device__ __forceinline__ int64 load_acquire(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -557,27 +564,27 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_acquire(float* ptr) +__device__ __forceinline__ float load_acquire(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -589,27 +596,27 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_acquire(double* ptr) +__device__ __forceinline__ double load_acquire(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -621,29 +628,29 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #endif } __device__ __forceinline__ thrust::complex load_relaxed_shared( - thrust::complex* ptr) + const thrust::complex* ptr) { float real_result; float imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" 
: "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -657,12 +664,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -670,19 +677,19 @@ __device__ __forceinline__ void store_relaxed_shared( __device__ __forceinline__ thrust::complex load_relaxed_shared( - thrust::complex* ptr) + const thrust::complex* ptr) { double real_result; double imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -696,12 +703,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -709,19 +716,19 @@ __device__ __forceinline__ void store_relaxed_shared( __device__ __forceinline__ thrust::complex load_relaxed( - thrust::complex* ptr) + const thrust::complex* ptr) { float real_result; float imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -734,11 +741,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -746,19 +753,19 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, __device__ __forceinline__ thrust::complex load_relaxed( - thrust::complex* ptr) + const thrust::complex* ptr) { double real_result; double imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), 
"=d"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -771,11 +778,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index a03cb47f4e7..dd5d682a9b8 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -1,11 +1,46 @@ #!/usr/bin/env python3 -import os -memory_spaces = [(".shared", ".cta", "_shared", "convert_generic_ptr_to_smem_ptr(ptr)", "r"), ("", ".gpu", "", "ptr", "l")] +import dataclasses + + +@dataclasses.dataclass +class space: + ptx_space_suffix: str + ptx_scope_suffix: str + fn_suffix: str + ptr_expr: str + ptr_constraint: str + + +@dataclasses.dataclass +class ordering: + ptx_load_suffix: str + fn_load_suffix: str + ptx_store_suffix: str + fn_store_suffix: str + is_relaxed: bool + + +@dataclasses.dataclass +class type_desc: + ptx_type_suffix: str + val_constraint: str + name: str + + +memory_spaces = [ + space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", + ptr_expr="convert_generic_ptr_to_smem_ptr((void*)ptr)", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="(void*)ptr", ptr_constraint="l")] memory_orderings = [ - (".relaxed", "_relaxed", ".relaxed", "_relaxed", True), - (".acquire", "_acquire", ".release", "_release", False) - ] -sizes=[(".b32", "r", "int32", 4), (".b64", "l", "int64", 8), (".f32", "f", "float", 4), (".f64", "d", "double", 8)] + ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", + ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) +] +types = [type_desc(ptx_type_suffix=".b32", val_constraint="r", name="int32"), + type_desc(ptx_type_suffix=".b64", val_constraint="l", name="int64"), + type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] # header print("""/************************************************************* Copyright (c) 2017-2023, the Ginkgo authors @@ -57,6 +92,13 @@ namespace cuda { +/** + * Transforms a generic CUDA pointer pointing to shared memory to a + * shared memory pointer for use in PTX assembly. + * CUDA PTX assembly uses 32bit pointers for shared memory addressing. + * The result is undefined for a generic pointer pointing to anything but + * shared memory. 
+ */ __device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) { // see @@ -104,23 +146,23 @@ """) # relaxed -for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: - for volta_load_ordering_suffix, load_function_ordering_suffix, volta_store_ordering_suffix, store_function_ordering_suffix, is_relaxed in memory_orderings: - for size_suffix, constraint, typename, size in sizes: - membar_expression = "" if is_relaxed else f"membar_acq_rel{function_memory_space_suffix}();" +for s in memory_spaces: + for o in memory_orderings: + for t in types: + membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" print(f""" -__device__ __forceinline__ {typename} load{load_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr) +__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ - {typename} result; + {t.name} result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile{memory_space_suffix}{size_suffix} %0, [%1];" - : "={constraint}"(result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #else - asm volatile("ld{volta_load_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} %0, [%1];" - : "={constraint}"(result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #endif {membar_expression} @@ -128,56 +170,57 @@ }} -__device__ __forceinline__ void store{store_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr, {typename} result) +__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{memory_space_suffix}{size_suffix} [%0], %1;" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else - asm volatile("st{volta_store_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} [%0], %1;" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} """) # vectorized relaxed loads for thrust::complex -sizes=[(".f32", "f", "float", 4), (".f64", "d", "double", 8)] -for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: - for size_suffix, constraint, typename, size in sizes: +types = [type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] +for s in memory_spaces: + for t in types: print(f""" -__device__ __forceinline__ thrust::complex<{typename}> load_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr) +__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ - {typename} real_result; - {typename} imag_result; + {t.name} real_result; + {t.name} imag_result; #if __CUDA_ARCH__ < 700 - asm 
volatile("ld.volatile{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" - : "={constraint}"(real_result), "={constraint}"(imag_result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #else - asm volatile("ld.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" - : "={constraint}"(real_result), "={constraint}"(imag_result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #endif - return thrust::complex<{typename}>{{real_result, imag_result}}; + return thrust::complex<{t.name}>{{real_result, imag_result}}; }} -__device__ __forceinline__ void store_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr, thrust::complex<{typename}> result) +__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result) {{ auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else - asm volatile("st.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} @@ -189,4 +232,4 @@ } // namespace gko #endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ -""") \ No newline at end of file +""") diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index b424c8bbc06..485f67343e0 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -52,14 +52,14 @@ namespace hip { template -__device__ __forceinline__ ValueType load_relaxed(ValueType* ptr) +__device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) { return load(ptr, 0); } template -__device__ __forceinline__ ValueType load_acquire(ValueType* ptr) +__device__ __forceinline__ ValueType load_acquire(const ValueType* ptr) { auto result = load(ptr, 0); __threadfence(); @@ -82,14 +82,14 @@ __device__ __forceinline__ void store_release(ValueType* ptr, ValueType value) template -__device__ __forceinline__ ValueType load_relaxed_shared(ValueType* ptr) +__device__ __forceinline__ ValueType load_relaxed_shared(const ValueType* ptr) { return load(ptr, 0); } template -__device__ __forceinline__ ValueType load_acquire_shared(ValueType* ptr) +__device__ __forceinline__ ValueType load_acquire_shared(const ValueType* ptr) { auto result = load(ptr, 0); __threadfence(); From 5b6422a1380acb5da3ec51f018a58ab17a9fdb55 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Sep 2023 22:40:40 +0200 Subject: [PATCH 3/7] restore peek functionality --- 
common/cuda_hip/components/syncfree.hpp.inc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc index 113c66d91ec..a8fa767e4dd 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp.inc @@ -107,7 +107,19 @@ public: group::tiled_partition(group::this_thread_block()).sync(); } - __device__ __forceinline__ bool peek(IndexType dependency) { return false; } + __device__ __forceinline__ bool peek(IndexType dependency) + { + const auto dep_block = dependency / (block_size / subwarp_size); + const auto dep_local = dependency % (block_size / subwarp_size); + // assert(dependency < work_id); + if (dep_block == block_id) { + // peek at a local dependency + return load_acquire_shared(local.status + dep_local); + } else { + // peek at a global dependency + return load_acquire(global.status + dependency); + } + } __device__ __forceinline__ void mark_ready() { From 6505d0641e84215ea88f1d8c93bca74486116058 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 14:44:39 +0200 Subject: [PATCH 4/7] use const_cast for CUDA atomic load/store wrappers --- cuda/components/memory.cuh | 216 ++++++++++-------- dev_tools/scripts/generate_cuda_memory_ptx.py | 22 +- 2 files changed, 136 insertions(+), 102 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 15f2541bddf..844fca6adf4 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -107,12 +107,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #endif @@ -142,12 +142,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), 
"l"(result) : "memory"); #endif @@ -177,12 +177,12 @@ __device__ __forceinline__ float load_relaxed_shared(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #endif @@ -210,15 +210,17 @@ __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #else - asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.relaxed.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #endif return result; @@ -229,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #endif @@ -247,12 +249,12 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -265,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #endif @@ -283,12 +285,12 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : 
"r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -301,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #endif @@ -319,12 +321,12 @@ __device__ __forceinline__ float load_acquire_shared(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -337,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #endif @@ -353,15 +355,17 @@ __device__ __forceinline__ double load_acquire_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #else - asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.acquire.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #endif membar_acq_rel_shared(); return result; @@ -373,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #endif @@ -391,12 +395,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : 
"memory"); #endif @@ -407,10 +411,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #endif } @@ -422,12 +428,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -438,10 +444,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #endif } @@ -453,12 +461,12 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -469,10 +477,12 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #endif } @@ -484,12 +494,12 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -500,10 +510,12 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #endif } @@ -515,12 +527,12 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(result) - : 
"l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -532,10 +544,12 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #endif } @@ -547,12 +561,12 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -564,10 +578,12 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #endif } @@ -579,12 +595,12 @@ __device__ __forceinline__ float load_acquire(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -596,10 +612,12 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #endif } @@ -611,12 +629,12 @@ __device__ __forceinline__ double load_acquire(const double* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -628,10 +646,12 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #endif } @@ -645,12 +665,14 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #else asm 
volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -664,12 +686,14 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -684,12 +708,14 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -703,12 +729,14 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -723,12 +751,12 @@ __device__ __forceinline__ thrust::complex load_relaxed( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -741,11 +769,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -760,12 +790,12 @@ __device__ __forceinline__ thrust::complex load_relaxed( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : 
"memory"); #endif return thrust::complex{real_result, imag_result}; @@ -778,11 +808,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index dd5d682a9b8..dae5f6c3a59 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -29,8 +29,8 @@ class type_desc: memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", - ptr_expr="convert_generic_ptr_to_smem_ptr((void*)ptr)", ptr_constraint="r"), - space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="(void*)ptr", ptr_constraint="l")] + ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), @@ -150,6 +150,7 @@ class type_desc: for o in memory_orderings: for t in types: membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" + ptr_expr = s.ptr_expr.format(typename=t.name) print(f""" __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ @@ -157,12 +158,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #else asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #endif {membar_expression} @@ -175,11 +176,11 @@ class type_desc: {membar_expression} #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} @@ -190,6 +191,7 @@ class type_desc: type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] for s in memory_spaces: for t in types: + ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>") print(f""" __device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ @@ -198,12 +200,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : 
"={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #else asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #endif return thrust::complex<{t.name}>{{real_result, imag_result}}; @@ -216,11 +218,11 @@ class type_desc: auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} From 33b1de85592a70612ca1fba9da12739cf44965f6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 15:59:26 +0200 Subject: [PATCH 5/7] remove unnecessary const casts --- cuda/components/memory.cuh | 104 +++++++----------- dev_tools/scripts/generate_cuda_memory_ptx.py | 28 +++-- 2 files changed, 56 insertions(+), 76 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 844fca6adf4..af3a0e838ea 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #endif @@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #endif @@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #endif @@ -231,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : 
"memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #endif @@ -267,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #endif @@ -303,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #endif @@ -339,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #endif @@ -377,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #endif @@ -411,12 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -444,12 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -477,12 +473,10 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" 
::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #endif } @@ -510,12 +504,10 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #endif } @@ -544,12 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -578,12 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -612,12 +600,10 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #endif } @@ -646,12 +632,10 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #endif } @@ -686,14 +670,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -729,14 +711,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - 
convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -769,13 +749,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -808,13 +786,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index dae5f6c3a59..4cbe05361c1 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -29,8 +29,8 @@ class type_desc: memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", - ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"), - space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")] + ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), @@ -150,7 +150,9 @@ class type_desc: for o in memory_orderings: for t in types: membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" - ptr_expr = s.ptr_expr.format(typename=t.name) + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast<{t.name}*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") print(f""" __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ @@ -158,12 +160,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #else asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #endif {membar_expression} @@ -176,11 +178,11 @@ class type_desc: {membar_expression} #if __CUDA_ARCH__ < 700 asm 
volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} @@ -191,7 +193,9 @@ class type_desc: type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] for s in memory_spaces: for t in types: - ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>") + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") print(f""" __device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ @@ -200,12 +204,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #else asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #endif return thrust::complex<{t.name}>{{real_result, imag_result}}; @@ -218,11 +222,11 @@ class type_desc: auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} From c6706ab904357069da8b92b61b6aff04a4f935fc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 9 Oct 2023 11:29:35 +0200 Subject: [PATCH 6/7] review updates - update asm type annotations - fix incorrect store Co-authored-by: Yuhsiang M. 
Tsai --- cuda/components/memory.cuh | 64 +++++++++---------- cuda/solver/common_trs_kernels.cuh | 2 +- dev_tools/scripts/generate_cuda_memory_ptx.py | 4 +- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index af3a0e838ea..4d814c7f513 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -105,12 +105,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b32 %0, [%1];" + asm volatile("ld.volatile.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" + asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -123,12 +123,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else - asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); @@ -140,12 +140,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b64 %0, [%1];" + asm volatile("ld.volatile.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" + asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -158,12 +158,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else - asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); @@ -247,12 +247,12 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b32 %0, [%1];" + asm volatile("ld.volatile.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" + asm volatile("ld.acquire.cta.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -266,12 +266,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) { membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else - asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), 
"r"(result) : "memory"); @@ -283,12 +283,12 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b64 %0, [%1];" + asm volatile("ld.volatile.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" + asm volatile("ld.acquire.cta.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -302,12 +302,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) { membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else - asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); @@ -393,12 +393,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b32 %0, [%1];" + asm volatile("ld.volatile.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.relaxed.gpu.b32 %0, [%1];" + asm volatile("ld.relaxed.gpu.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); @@ -411,10 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -424,12 +424,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b64 %0, [%1];" + asm volatile("ld.volatile.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.relaxed.gpu.b64 %0, [%1];" + asm volatile("ld.relaxed.gpu.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); @@ -442,10 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -517,12 +517,12 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b32 %0, [%1];" + asm volatile("ld.volatile.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.acquire.gpu.b32 %0, [%1];" + asm volatile("ld.acquire.gpu.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); @@ -536,10 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), 
"r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -549,12 +549,12 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b64 %0, [%1];" + asm volatile("ld.volatile.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.acquire.gpu.b64 %0, [%1];" + asm volatile("ld.acquire.gpu.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); @@ -568,10 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 546b366c6a2..6dbd65968d0 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -449,7 +449,7 @@ __global__ void sptrsv_naive_caching_kernel( // This check to ensure no infinite loops happen. if (is_nan(r)) { - store_relaxed(x_s + self_shid, zero()); + store_relaxed_shared(x_s + self_shid, zero()); store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 4cbe05361c1..d75a9f908b8 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -37,8 +37,8 @@ class type_desc: ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) ] -types = [type_desc(ptx_type_suffix=".b32", val_constraint="r", name="int32"), - type_desc(ptx_type_suffix=".b64", val_constraint="l", name="int64"), +types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), + type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] # header From 5acabea258fe5a8a3b769861040b47e786e8fc54 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 10 Oct 2023 10:50:50 +0200 Subject: [PATCH 7/7] add note to generated file --- cuda/components/memory.cuh | 3 +++ dev_tools/scripts/generate_cuda_memory_ptx.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 4d814c7f513..a1a53284e3f 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -43,6 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/types.hpp" +// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py + + namespace gko { namespace kernels { namespace cuda { diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index d75a9f908b8..42bef50f9a2 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -87,6 +87,9 @@ class type_desc: #include "cuda/base/types.hpp" +// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py + + namespace gko { namespace kernels { namespace cuda {