From 33b1de85592a70612ca1fba9da12739cf44965f6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 25 Sep 2023 15:59:26 +0200
Subject: [PATCH] remove unnecessary const casts

---
 cuda/components/memory.cuh                    | 104 +++++++-----
 dev_tools/scripts/generate_cuda_memory_ptx.py |  28 +++--
 2 files changed, 56 insertions(+), 76 deletions(-)

diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh
index 844fca6adf4..af3a0e838ea 100644
--- a/cuda/components/memory.cuh
+++ b/cuda/components/memory.cuh
@@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
 {
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "r"(result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "r"(result)
                  : "memory");
 #endif
@@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
 {
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "l"(result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "l"(result)
                  : "memory");
 #endif
@@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result)
 {
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(result)
                  : "memory");
 #endif
@@ -231,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result)
 {
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(result)
                  : "memory");
 #endif
@@ -267,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
     membar_acq_rel_shared();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "r"(result)
                  : "memory");
 #else
     asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "r"(result)
                  : "memory");
 #endif
@@ -303,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
     membar_acq_rel_shared();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "l"(result)
                  : "memory");
 #else
     asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "l"(result)
                  : "memory");
 #endif
@@ -339,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result)
     membar_acq_rel_shared();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(result)
                  : "memory");
 #else
     asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(result)
                  : "memory");
 #endif
@@ -377,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result)
     membar_acq_rel_shared();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(result)
                  : "memory");
 #else
     asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(result)
                  : "memory");
 #endif
@@ -411,12 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
 __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
 {
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
-                 "r"(result)
+    asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
-                 "r"(result)
+    asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
                  : "memory");
 #endif
 }
@@ -444,12 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
 __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
 {
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
-                 "l"(result)
+    asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
-                 "l"(result)
+    asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
                  : "memory");
 #endif
 }
@@ -477,12 +473,10 @@ __device__ __forceinline__ float load_relaxed(const float* ptr)
 __device__ __forceinline__ void store_relaxed(float* ptr, float result)
 {
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
-                 "f"(result)
+    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
-                 "f"(result)
+    asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
                  : "memory");
 #endif
 }
@@ -510,12 +504,10 @@ __device__ __forceinline__ double load_relaxed(const double* ptr)
 __device__ __forceinline__ void store_relaxed(double* ptr, double result)
 {
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
-                 "d"(result)
+    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
-                 "d"(result)
+    asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
                  : "memory");
 #endif
 }
@@ -544,12 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result)
 {
     membar_acq_rel();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
-                 "r"(result)
+    asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result)
                  : "memory");
 #else
-    asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast<int32*>(ptr)),
-                 "r"(result)
+    asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result)
                  : "memory");
 #endif
 }
@@ -578,12 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result)
 {
     membar_acq_rel();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
-                 "l"(result)
+    asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result)
                  : "memory");
 #else
-    asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast<int64*>(ptr)),
-                 "l"(result)
+    asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result)
                  : "memory");
 #endif
 }
@@ -612,12 +600,10 @@ __device__ __forceinline__ void store_release(float* ptr, float result)
 {
     membar_acq_rel();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
-                 "f"(result)
+    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
                  : "memory");
 #else
-    asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast<float*>(ptr)),
-                 "f"(result)
+    asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
                  : "memory");
 #endif
 }
@@ -646,12 +632,10 @@ __device__ __forceinline__ void store_release(double* ptr, double result)
 {
     membar_acq_rel();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
-                 "d"(result)
+    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
                  : "memory");
 #else
-    asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast<double*>(ptr)),
-                 "d"(result)
+    asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
                  : "memory");
 #endif
 }
@@ -686,14 +670,12 @@ __device__ __forceinline__ void store_relaxed_shared(
     auto imag_result = result.imag();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(
-                         const_cast<thrust::complex<float>*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(real_result), "f"(imag_result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(
-                         const_cast<thrust::complex<float>*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "f"(real_result), "f"(imag_result)
                  : "memory");
 #endif
@@ -729,14 +711,12 @@ __device__ __forceinline__ void store_relaxed_shared(
     auto imag_result = result.imag();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(
-                         const_cast<thrust::complex<double>*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(real_result), "d"(imag_result)
                  : "memory");
 #else
     asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(
-                         const_cast<thrust::complex<double>*>(ptr))),
+                     convert_generic_ptr_to_smem_ptr(ptr)),
                  "d"(real_result), "d"(imag_result)
                  : "memory");
 #endif
@@ -769,13 +749,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<float>* ptr,
     auto real_result = result.real();
     auto imag_result = result.imag();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(
-                     const_cast<thrust::complex<float>*>(ptr)),
+    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
                  "f"(real_result), "f"(imag_result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(
-                     const_cast<thrust::complex<float>*>(ptr)),
+    asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
                  "f"(real_result), "f"(imag_result)
                  : "memory");
 #endif
@@ -808,13 +786,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
     auto real_result = result.real();
     auto imag_result = result.imag();
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(
-                     const_cast<thrust::complex<double>*>(ptr)),
+    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
                  "d"(real_result), "d"(imag_result)
                  : "memory");
 #else
-    asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(
-                     const_cast<thrust::complex<double>*>(ptr)),
+    asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
                  "d"(real_result), "d"(imag_result)
                  : "memory");
 #endif
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
index dae5f6c3a59..4cbe05361c1 100755
--- a/dev_tools/scripts/generate_cuda_memory_ptx.py
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -29,8 +29,8 @@ class type_desc:
 
 memory_spaces = [
     space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared",
-          ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"),
-    space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")]
+          ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"),
+    space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")]
 memory_orderings = [
     ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed",
              ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True),
@@ -150,7 +150,9 @@ class type_desc:
     for o in memory_orderings:
         for t in types:
             membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();"
-            ptr_expr = s.ptr_expr.format(typename=t.name)
+            const_ptr_expr = s.ptr_expr.format(
+                ptr=f"const_cast<{t.name}*>(ptr)")
+            mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
             print(f"""
 __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
 {{
@@ -158,12 +160,12 @@ class type_desc:
 #if __CUDA_ARCH__ < 700
     asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
                  : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({ptr_expr})
+                 : "{s.ptr_constraint}"({const_ptr_expr})
                  : "memory");
 #else
     asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
                  : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({ptr_expr})
+                 : "{s.ptr_constraint}"({const_ptr_expr})
                  : "memory");
 #endif
     {membar_expression}
@@ -176,11 +178,11 @@ class type_desc:
     {membar_expression}
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result)
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
                  : "memory");
 #else
     asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result)
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
                  : "memory");
 #endif
 }}
@@ -191,7 +193,9 @@ class type_desc:
          type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")]
 for s in memory_spaces:
     for t in types:
-        ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>")
+        const_ptr_expr = s.ptr_expr.format(
+            ptr=f"const_cast<thrust::complex<{t.name}>*>(ptr)")
+        mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
         print(f"""
 __device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr)
 {{
@@ -200,12 +204,12 @@ class type_desc:
 #if __CUDA_ARCH__ < 700
     asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
                  : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({ptr_expr})
+                 : "{s.ptr_constraint}"({const_ptr_expr})
                  : "memory");
 #else
     asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
                  : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({ptr_expr})
+                 : "{s.ptr_constraint}"({const_ptr_expr})
                  : "memory");
 #endif
     return thrust::complex<{t.name}>{{real_result, imag_result}};
@@ -218,11 +222,11 @@ class type_desc:
     auto imag_result = result.imag();
 #if __CUDA_ARCH__ < 700
     asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
                  : "memory");
 #else
     asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
                  : "memory");
 #endif
 }}
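Note (not part of the patch): the const_cast is only redundant on the store paths, which already receive a mutable pointer. The load functions keep their cast, since they take a const pointer that, for the shared-memory variants, still has to be passed to the generic-to-shared address conversion helper. The following is a minimal, self-contained sketch of that asymmetry for the pre-Volta (__CUDA_ARCH__ < 700) int32 shared-memory path. The names to_smem_addr, load_relaxed_shared_sketch and store_relaxed_shared_sketch are hypothetical stand-ins for convert_generic_ptr_to_smem_ptr and the generated load_relaxed_shared/store_relaxed_shared, assuming the real conversion helper likewise takes a non-const pointer.

// Illustrative sketch only; hypothetical names, not part of memory.cuh.
#include <cstdint>

// Stand-in for convert_generic_ptr_to_smem_ptr: converts a generic pointer
// into a 32-bit shared-state-space address for use in shared-memory PTX.
// Assumed, as in the generated header, to take a non-const pointer.
__device__ __forceinline__ std::uint32_t to_smem_addr(void* ptr)
{
    std::uint32_t smem;
    asm("{ .reg .u64 t; cvta.to.shared.u64 t, %1; cvt.u32.u64 %0, t; }"
        : "=r"(smem)
        : "l"(ptr));
    return smem;
}

// Load path: ptr is const, so a const_cast is still required before the
// pointer can be handed to the non-const conversion helper.
__device__ __forceinline__ int load_relaxed_shared_sketch(const int* ptr)
{
    int result;
    asm volatile("ld.volatile.shared.b32 %0, [%1];"
                 : "=r"(result)
                 : "r"(to_smem_addr(const_cast<int*>(ptr)))
                 : "memory");
    return result;
}

// Store path: ptr is already mutable, so no cast is needed; this is exactly
// the const_cast the patch removes.
__device__ __forceinline__ void store_relaxed_shared_sketch(int* ptr, int value)
{
    asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"(to_smem_addr(ptr)),
                 "r"(value)
                 : "memory");
}

__global__ void sketch_kernel(int* out)
{
    __shared__ int cell;
    if (threadIdx.x == 0) {
        store_relaxed_shared_sketch(&cell, 42);
    }
    __syncthreads();
    out[threadIdx.x] = load_relaxed_shared_sketch(&cell);
}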