
Commit

review updates
- update asm type annotations
- fix incorrect store

Co-authored-by: Yuhsiang M. Tsai <[email protected]>
upsj and yhmtsai committed Oct 10, 2023
1 parent 33b1de8 commit c6706ab
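
For context, every hunk in memory.cuh below follows the same inline-PTX pattern; the commit only changes the ld/st type suffix from the untyped .b32/.b64 forms to the signed .s32/.s64 forms, which name the int32/int64 operands bound through the "r"/"l" constraints. A minimal, self-contained sketch of that pattern (mirroring the diff, not copied verbatim from the Ginkgo headers; the int32 alias and the function name are local to this sketch):

```cuda
#include <cstdint>

// Local alias standing in for Ginkgo's int32 type (assumption for this sketch).
using int32 = std::int32_t;

// Relaxed 32-bit load in the style of cuda/components/memory.cuh after this
// commit: the .s32 type suffix names the signed operand type that is bound
// through the "r" (32-bit register) constraint.
__device__ __forceinline__ int32 load_relaxed_sketch(const int32* ptr)
{
    int32 result;
#if __CUDA_ARCH__ < 700
    // Pre-Volta fallback: volatile load, since the .relaxed qualifier needs sm_70+.
    asm volatile("ld.volatile.s32 %0, [%1];"
                 : "=r"(result)
                 : "l"(const_cast<int32*>(ptr))
                 : "memory");
#else
    // Volta and newer: relaxed load with GPU scope.
    asm volatile("ld.relaxed.gpu.s32 %0, [%1];"
                 : "=r"(result)
                 : "l"(const_cast<int32*>(ptr))
                 : "memory");
#endif
    return result;
}
```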
Showing 3 changed files with 35 additions and 35 deletions.
64 changes: 32 additions & 32 deletions cuda/components/memory.cuh
@@ -105,12 +105,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.shared.b32 %0, [%1];"
+asm volatile("ld.volatile.shared.s32 %0, [%1];"
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
: "memory");
#else
-asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];"
+asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];"
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
: "memory");
@@ -123,12 +123,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
__device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
{
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
+asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#else
-asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"(
+asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
@@ -140,12 +140,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.shared.b64 %0, [%1];"
+asm volatile("ld.volatile.shared.s64 %0, [%1];"
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
: "memory");
#else
-asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];"
+asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];"
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
: "memory");
@@ -158,12 +158,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
__device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
{
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
+asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#else
-asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"(
+asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
@@ -247,12 +247,12 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.shared.b32 %0, [%1];"
+asm volatile("ld.volatile.shared.s32 %0, [%1];"
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
: "memory");
#else
-asm volatile("ld.acquire.cta.shared.b32 %0, [%1];"
+asm volatile("ld.acquire.cta.shared.s32 %0, [%1];"
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
: "memory");
@@ -266,12 +266,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
{
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
+asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#else
-asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"(
+asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
@@ -283,12 +283,12 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.shared.b64 %0, [%1];"
+asm volatile("ld.volatile.shared.s64 %0, [%1];"
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
: "memory");
#else
-asm volatile("ld.acquire.cta.shared.b64 %0, [%1];"
+asm volatile("ld.acquire.cta.shared.s64 %0, [%1];"
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
: "memory");
@@ -302,12 +302,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
{
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
+asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#else
-asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"(
+asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"(
convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
@@ -393,12 +393,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.b32 %0, [%1];"
+asm volatile("ld.volatile.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#else
-asm volatile("ld.relaxed.gpu.b32 %0, [%1];"
+asm volatile("ld.relaxed.gpu.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
@@ -411,10 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
{
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
+asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
-asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
+asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}
@@ -424,12 +424,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.b64 %0, [%1];"
+asm volatile("ld.volatile.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#else
-asm volatile("ld.relaxed.gpu.b64 %0, [%1];"
+asm volatile("ld.relaxed.gpu.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
@@ -442,10 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
{
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
+asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
-asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
+asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}
@@ -517,12 +517,12 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.b32 %0, [%1];"
+asm volatile("ld.volatile.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#else
-asm volatile("ld.acquire.gpu.b32 %0, [%1];"
+asm volatile("ld.acquire.gpu.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
@@ -536,10 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
+asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
-asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
+asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}
@@ -549,12 +549,12 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
-asm volatile("ld.volatile.b64 %0, [%1];"
+asm volatile("ld.volatile.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#else
-asm volatile("ld.acquire.gpu.b64 %0, [%1];"
+asm volatile("ld.acquire.gpu.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
@@ -568,10 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
-asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
+asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
-asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
+asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}
2 changes: 1 addition & 1 deletion cuda/solver/common_trs_kernels.cuh
@@ -449,7 +449,7 @@ __global__ void sptrsv_naive_caching_kernel(

// This check to ensure no infinite loops happen.
if (is_nan(r)) {
-store_relaxed(x_s + self_shid, zero<ValueType>());
+store_relaxed_shared(x_s + self_shid, zero<ValueType>());
store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
}
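
The one-line change above is the "fix incorrect store" from the commit message: judging from the kernel's name and the switch to the _shared overload, x_s is the kernel's shared-memory cache, so the store now goes through the shared-memory path. Condensed from the memory.cuh diff above (sm_70+ branch, int32 overloads only; convert_generic_ptr_to_smem_ptr and int32 come from that same header), the two overloads differ in state space, scope, and pointer constraint:

```cuda
// Generic-pointer path: 64-bit generic address ("l"), GPU scope.
__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
{
    asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
                 : "memory");
}

// Shared-memory path: pointer converted to a .shared address ("r"), CTA scope.
__device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
{
    asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"(
                     convert_generic_ptr_to_smem_ptr(ptr)),
                 "r"(result)
                 : "memory");
}
```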
4 changes: 2 additions & 2 deletions dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -37,8 +37,8 @@ class type_desc:
ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire",
ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False)
]
-types = [type_desc(ptx_type_suffix=".b32", val_constraint="r", name="int32"),
-type_desc(ptx_type_suffix=".b64", val_constraint="l", name="int64"),
+types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"),
+type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"),
type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"),
type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")]
# header
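
The memory.cuh header above is generated (or at least kept in sync) by this script, so the .s32/.s64 change originates in these type_desc entries; the .f32/"f" and .f64/"d" descriptors are untouched. As an illustration only, the float/double functions are not part of this diff, but the ".f32" descriptor would presumably expand to an overload analogous to the int32 one, along these lines:

```cuda
// Illustrative guess at the generated float overload (not shown in this diff):
// the ".f32" suffix pairs with the "f" (32-bit float register) constraint from
// the descriptor above. sm_70+ branch only.
__device__ __forceinline__ float load_relaxed(const float* ptr)
{
    float result;
    asm volatile("ld.relaxed.gpu.f32 %0, [%1];"
                 : "=f"(result)
                 : "l"(const_cast<float*>(ptr))
                 : "memory");
    return result;
}
```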
