Commit

remove unnecessary const casts
upsj committed Oct 10, 2023
1 parent 6505d06 commit 33b1de8
Showing 2 changed files with 56 additions and 76 deletions.
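
The store wrappers touched below already receive a mutable pointer, so the const_cast they applied before handing the pointer to the inline PTX was an identity cast; only the load wrappers, whose parameter is const-qualified, still need one. A minimal sketch of the resulting store path (sm_70+ branch only, pre-Volta branch omitted; int32 and convert_generic_ptr_to_smem_ptr are the alias and helper used throughout the diff):

    // Illustrative sketch, not part of the diff: `ptr` is already int32*,
    // so const_cast<int32*>(ptr) changed nothing.
    __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
    {
        // before: "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
        // after:  "r"(convert_generic_ptr_to_smem_ptr(ptr))
        asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"(
                         convert_generic_ptr_to_smem_ptr(ptr)),
                     "r"(result)
                     : "memory");
    }
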
104 changes: 40 additions & 64 deletions cuda/components/memory.cuh
@@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#endif
@@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#endif
@@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(result)
: "memory");
#endif
@@ -231,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(result)
: "memory");
#endif
@@ -267,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#else
asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"r"(result)
: "memory");
#endif
@@ -303,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#else
asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"l"(result)
: "memory");
#endif
@@ -339,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result)
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(result)
: "memory");
#else
asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(result)
: "memory");
#endif
@@ -377,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result)
membar_acq_rel_shared();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(result)
: "memory");
#else
asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"(
- convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(result)
: "memory");
#endif
@@ -411,12 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
{
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
- "r"(result)
+ asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
- "r"(result)
+ asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}
@@ -444,12 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
{
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
- "l"(result)
+ asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
- "l"(result)
+ asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}
@@ -477,12 +473,10 @@ __device__ __forceinline__ float load_relaxed(const float* ptr)
__device__ __forceinline__ void store_relaxed(float* ptr, float result)
{
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
- "f"(result)
+ asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
- "f"(result)
+ asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#endif
}
@@ -510,12 +504,10 @@ __device__ __forceinline__ double load_relaxed(const double* ptr)
__device__ __forceinline__ void store_relaxed(double* ptr, double result)
{
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
- "d"(result)
+ asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
- "d"(result)
+ asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#endif
}
@@ -544,12 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
- "r"(result)
+ asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
- asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
- "r"(result)
+ asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}
@@ -578,12 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
- "l"(result)
+ asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
- asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
- "l"(result)
+ asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}
@@ -612,12 +600,10 @@ __device__ __forceinline__ void store_release(float* ptr, float result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
- "f"(result)
+ asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#else
- asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
- "f"(result)
+ asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#endif
}
@@ -646,12 +632,10 @@ __device__ __forceinline__ void store_release(double* ptr, double result)
{
membar_acq_rel();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
- "d"(result)
+ asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#else
- asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
- "d"(result)
+ asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#endif
}
@@ -686,14 +670,12 @@ __device__ __forceinline__ void store_relaxed_shared(
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"(
- convert_generic_ptr_to_smem_ptr(
- const_cast<thrust::complex<float>*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(real_result), "f"(imag_result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"(
- convert_generic_ptr_to_smem_ptr(
- const_cast<thrust::complex<float>*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"f"(real_result), "f"(imag_result)
: "memory");
#endif
@@ -729,14 +711,12 @@ __device__ __forceinline__ void store_relaxed_shared(
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"(
- convert_generic_ptr_to_smem_ptr(
- const_cast<thrust::complex<double>*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(real_result), "d"(imag_result)
: "memory");
#else
asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"(
- convert_generic_ptr_to_smem_ptr(
- const_cast<thrust::complex<double>*>(ptr))),
+ convert_generic_ptr_to_smem_ptr(ptr)),
"d"(real_result), "d"(imag_result)
: "memory");
#endif
@@ -769,13 +749,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<float>* ptr,
auto real_result = result.real();
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(
- const_cast<thrust::complex<float>*>(ptr)),
+ asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(
- const_cast<thrust::complex<float>*>(ptr)),
+ asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
#endif
@@ -808,13 +786,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
auto real_result = result.real();
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
- asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(
- const_cast<thrust::complex<double>*>(ptr)),
+ asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
"d"(real_result), "d"(imag_result)
: "memory");
#else
- asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(
- const_cast<thrust::complex<double>*>(ptr)),
+ asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
"d"(real_result), "d"(imag_result)
: "memory");
#endif
28 changes: 16 additions & 12 deletions dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -29,8 +29,8 @@ class type_desc:

memory_spaces = [
space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared",
- ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"),
- space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")]
+ ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"),
+ space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")]
memory_orderings = [
ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed",
ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True),
@@ -150,20 +150,22 @@ class type_desc:
for o in memory_orderings:
for t in types:
membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();"
- ptr_expr = s.ptr_expr.format(typename=t.name)
+ const_ptr_expr = s.ptr_expr.format(
+     ptr=f"const_cast<{t.name}*>(ptr)")
+ mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
print(f"""
__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
{{
{t.name} result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
: "={t.val_constraint}"(result)
- : "{s.ptr_constraint}"({ptr_expr})
+ : "{s.ptr_constraint}"({const_ptr_expr})
: "memory");
#else
asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
: "={t.val_constraint}"(result)
- : "{s.ptr_constraint}"({ptr_expr})
+ : "{s.ptr_constraint}"({const_ptr_expr})
: "memory");
#endif
{membar_expression}
@@ -176,11 +178,11 @@ class type_desc:
{membar_expression}
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
- :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result)
+ :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
: "memory");
#else
asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
- :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result)
+ :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
: "memory");
#endif
}}
@@ -191,7 +193,9 @@ class type_desc:
type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")]
for s in memory_spaces:
for t in types:
- ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>")
+ const_ptr_expr = s.ptr_expr.format(
+     ptr=f"const_cast<thrust::complex<{t.name}>*>(ptr)")
+ mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
print(f"""
__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr)
{{
@@ -200,12 +204,12 @@ class type_desc:
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
: "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
- : "{s.ptr_constraint}"({ptr_expr})
+ : "{s.ptr_constraint}"({const_ptr_expr})
: "memory");
#else
asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
: "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
- : "{s.ptr_constraint}"({ptr_expr})
+ : "{s.ptr_constraint}"({const_ptr_expr})
: "memory");
#endif
return thrust::complex<{t.name}>{{real_result, imag_result}};
@@ -218,11 +222,11 @@ class type_desc:
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
- :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+ :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
: "memory");
#else
asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
- :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+ :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
: "memory");
#endif
}}
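
As a cross-check of the template above, this is roughly what the generator now emits for a relaxed shared-memory load of int32 on sm_70+ (a sketch reconstructed from the f-string, not copied from the generated header). The load keeps its const_cast because its parameter is const-qualified, which is exactly the distinction the new const_ptr_expr / mut_ptr_expr pair encodes:

    __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
    {
        int32 result;
        // const_ptr_expr: the cast strips the const qualifier before the
        // generic pointer is converted to a shared-memory address.
        asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];"
                     : "=r"(result)
                     : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
                     : "memory");
        return result;
    }
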
