From 1b81c4dad9e0e7a2711a09ee72e70efaf3014bbe Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 14 Aug 2023 10:51:00 +0200 Subject: [PATCH 1/4] Bump GPUCompiler. --- Manifest.toml | 4 ++-- Project.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index bb0c3de750..0f1ab55a2f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -142,9 +142,9 @@ version = "0.1.5" [[GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3" +git-tree-sha1 = "8de395b1243771bbb79ac832ec96c7def7a4586f" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.21.4" +version = "0.22.0" [[InlineStrings]] deps = ["Parsers"] diff --git a/Project.toml b/Project.toml index 24c34fed97..64effc9529 100644 --- a/Project.toml +++ b/Project.toml @@ -46,7 +46,7 @@ Crayons = "4" DataFrames = "1" ExprTools = "0.1" GPUArrays = "8.6" -GPUCompiler = "0.21" +GPUCompiler = "0.22" KernelAbstractions = "0.9.2" LLVM = "6" Preferences = "1" From aff8cd3e1c4c4ecf4d943e893d592d9624ffb21b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 14 Aug 2023 10:58:32 +0200 Subject: [PATCH 2/4] Add support for fastmath kwarg. 
--- src/compiler/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index ad56d0766c..4850bdf343 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -6,7 +6,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :shmem, :stream] From b85130d053a4f40c57aa0cfe0cd9e94659d4e4a0 Mon Sep 17 00:00:00 2001 From: Zentrik Date: Tue, 15 Aug 2023 09:13:49 +0200 Subject: [PATCH 3/4] Add fastmath codegen test and document the kwarg. --- src/compiler/execution.jl | 1 + test/core/codegen.jl | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 4850bdf343..fa08b077d4 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -306,6 +306,7 @@ The following keyword arguments are supported: supported on LLVM 4.0+) - `name`: override the name that the kernel will have in the generated code - `always_inline`: inline all function calls in the kernel +- `fastmath`: use less precise square roots and flush denormals The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. 
New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index e948972407..d4b044cdff 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -157,6 +157,29 @@ end @test !occursin(".local", asm) end +@testset "fastmath" begin + function sqrt_kernel(x) + i = threadIdx().x + @inbounds x[i] = sqrt(x[i]) + return + end + + function div_kernel(x) + i = threadIdx().x + @fastmath @inbounds x[i] = 1 / x[i] + return + end + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) + @test occursin("sqrt.r", asm) + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) + @test occursin("sqrt.approx.ftz", asm) + + asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) + @test occursin("div.approx.ftz", asm) +end + end ############################################################################################ From a2d3219578006e7795611fb896a3b12c63a7c586 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Aug 2023 17:15:17 +0200 Subject: [PATCH 4/4] Disable sqrt fast math test on CUDA 11.0. 
--- test/core/codegen.jl | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index d4b044cdff..9ca772d0f8 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -158,26 +158,29 @@ end end @testset "fastmath" begin - function sqrt_kernel(x) - i = threadIdx().x - @inbounds x[i] = sqrt(x[i]) - return - end - function div_kernel(x) i = threadIdx().x @fastmath @inbounds x[i] = 1 / x[i] return end - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) - @test occursin("sqrt.r", asm) - - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) - @test occursin("sqrt.approx.ftz", asm) - asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) @test occursin("div.approx.ftz", asm) + + # libdevice only contains fast math versions of sqrt for CUDA 11.1+ + if CUDA.runtime_version() >= v"11.1" + function sqrt_kernel(x) + i = threadIdx().x + @inbounds x[i] = sqrt(x[i]) + return + end + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) + @test occursin("sqrt.r", asm) + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) + @test occursin("sqrt.approx.ftz", asm) + end end end