From 1b81c4dad9e0e7a2711a09ee72e70efaf3014bbe Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 14 Aug 2023 10:51:00 +0200
Subject: [PATCH 1/4] Bump GPUCompiler.

---
 Manifest.toml | 4 ++--
 Project.toml  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index bb0c3de750..0f1ab55a2f 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -142,9 +142,9 @@ version = "0.1.5"
 
 [[GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
+git-tree-sha1 = "8de395b1243771bbb79ac832ec96c7def7a4586f"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.21.4"
+version = "0.22.0"
 
 [[InlineStrings]]
 deps = ["Parsers"]
diff --git a/Project.toml b/Project.toml
index 24c34fed97..64effc9529 100644
--- a/Project.toml
+++ b/Project.toml
@@ -46,7 +46,7 @@ Crayons = "4"
 DataFrames = "1"
 ExprTools = "0.1"
 GPUArrays = "8.6"
-GPUCompiler = "0.21"
+GPUCompiler = "0.22"
 KernelAbstractions = "0.9.2"
 LLVM = "6"
 Preferences = "1"

From aff8cd3e1c4c4ecf4d943e893d592d9624ffb21b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 14 Aug 2023 10:58:32 +0200
Subject: [PATCH 2/4] Add support for fastmath kwarg.

---
 src/compiler/execution.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index ad56d0766c..4850bdf343 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -6,7 +6,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :shmem, :stream]
 
 

From b85130d053a4f40c57aa0cfe0cd9e94659d4e4a0 Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Tue, 15 Aug 2023 09:13:49 +0200
Subject: [PATCH 3/4] Add fastmath codegen test and document the kwarg.

---
 src/compiler/execution.jl |  1 +
 test/core/codegen.jl      | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 4850bdf343..fa08b077d4 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -306,6 +306,7 @@ The following keyword arguments are supported:
   supported on LLVM 4.0+)
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
+- `fastmath`: use less precise (approximate) square roots and flush denormals to zero
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index e948972407..d4b044cdff 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -157,6 +157,29 @@ end
     @test !occursin(".local", asm)
 end
 
+@testset "fastmath" begin
+    function sqrt_kernel(x)
+        i = threadIdx().x
+        @inbounds x[i] = sqrt(x[i])
+        return
+    end
+
+    function div_kernel(x)
+        i = threadIdx().x
+        @fastmath @inbounds x[i] = 1 / x[i]
+        return
+    end
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
+    @test occursin("sqrt.r", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("sqrt.approx.ftz", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("div.approx.ftz", asm)
+end
+
 end
 
 ############################################################################################

From a2d3219578006e7795611fb896a3b12c63a7c586 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 17 Aug 2023 17:15:17 +0200
Subject: [PATCH 4/4] Disable sqrt fast math test on CUDA 11.0.

---
 test/core/codegen.jl | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index d4b044cdff..9ca772d0f8 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -158,26 +158,29 @@ end
 end
 
 @testset "fastmath" begin
-    function sqrt_kernel(x)
-        i = threadIdx().x
-        @inbounds x[i] = sqrt(x[i])
-        return
-    end
-
     function div_kernel(x)
         i = threadIdx().x
         @fastmath @inbounds x[i] = 1 / x[i]
         return
     end
 
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
-    @test occursin("sqrt.r", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("sqrt.approx.ftz", asm)
-
     asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
     @test occursin("div.approx.ftz", asm)
+
+    # libdevice only contains fast math versions of sqrt for CUDA 11.1+; skip the sqrt checks on older runtimes
+    if CUDA.runtime_version() >= v"11.1"
+        function sqrt_kernel(x)
+            i = threadIdx().x
+            @inbounds x[i] = sqrt(x[i])
+            return
+        end
+
+        asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
+        @test occursin("sqrt.r", asm)
+
+        asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+        @test occursin("sqrt.approx.ftz", asm)
+    end
 end
 
 end