diff --git a/.gitignore b/.gitignore
index c181d1f8..ba39cc53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-test/Manifest.toml
+Manifest.toml
diff --git a/configs/configs.jl b/configs/configs.jl
index c88b2d8d..0bc12271 100644
--- a/configs/configs.jl
+++ b/configs/configs.jl
@@ -3,6 +3,7 @@
 using GemmKernels
 using LinearAlgebra
 using ForwardDiff
+using Octavian
 
 struct Configuration
     name # Human-readable name of the configuration.
@@ -238,10 +239,10 @@ macro get_wmma_config()
                       CD_type,
                       transpose_a,
                       transpose_b,
-                      mul!,
+                      Octavian.matmul!,
                       Epilogue.Default(),
                       verify_default,
-                      Kernel.matmul_pipelined,
+                      kernel,
                       wmma_baseline)
     end end)
 end
@@ -520,7 +521,8 @@ function get_configs()
                                               [2, 2, 1],
                                               [1, 1, 2],
                                               [2, 2, 2]], [[2048, 2048, 2048]]),
-        zero_c in [false]
+        zero_c in [false],
+        kernel in [Kernel.matmul_pipelined]
         push!(rv, @get_wmma_config)
     end
 
diff --git a/src/kernel.jl b/src/kernel.jl
index ef6f9266..d9a3f4b1 100644
--- a/src/kernel.jl
+++ b/src/kernel.jl
@@ -317,6 +317,8 @@ function matmul_pipelined(conf::GemmKernels.Config, a, b, c, d,
 
                 @loopinfo unroll for j = 1 : num_fragments_n
                     b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*conf.compute_op_shape.N))
+                    # `b_tile` is a K x N tile (built from `warp_tile.KN` above), so bound-check K and N; it has no M component.
+                    @cuassert ((b_tile.base.K + b_tile.offset.K) < conf.matmul_shape.K) && ((b_tile.base.N + b_tile.offset.N) < conf.matmul_shape.N)
                     @inbounds @immutable b_frags[nxt_stage, j] = transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile)
                 end
 
diff --git a/test/Project.toml b/test/Project.toml
index 8828b9af..8d1be0aa 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -5,5 +5,6 @@ Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 XUnit = "3e3c03f2-1a94-11e9-2981-050a4ca824ab"
diff --git a/tuning/Project.toml b/tuning/Project.toml
new file mode 100644
index 00000000..aa266efe
--- /dev/null
+++ b/tuning/Project.toml
@@ -0,0 +1,5 @@
+[deps]
+Hyperopt = "93e5fe13-2215-51db-baaf-2e9a34fb2712"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+PythonPlot = "274fc56d-3b97-40fa-a1cd-1b4a50311bf9"
diff --git a/tuning/tune-wmma.jl b/tuning/tune-wmma.jl
new file mode 100644
index 00000000..2b5ebc7d
--- /dev/null
+++ b/tuning/tune-wmma.jl
@@ -0,0 +1,214 @@
+using CUDA, GemmKernels
+using Hyperopt
+using Plots
+pythonplot()
+
+# Number of profiled runs per candidate configuration while tuning.
+const NUM_SAMPLES = 250
+# Number of profiled runs per matrix size while plotting.
+const NUM_SAMPLES_PLOT = 250
+
+include("../configs/configs.jl")
+
+# `@get_wmma_config` takes no arguments and reads its parameters (element types,
+# shapes, `kernel`, ...) from identically named variables in the calling scope.
+AB_type = Float16
+CD_type = Float32
+
+zero_c = true
+
+OP_M, OP_N, OP_K = 16, 16, 16
+
+markershapes = Dict(
+    "NN" => :circle,
+    "TT" => :cross,
+    "TN" => :diamond,
+    "NT" => :dtriangle,
+   )
+
+# Fresh per-category tallies for a tuning run.
+new_counters() = Dict(
+    "total" => 0,
+    "invalid_config" => 0,
+    "invalid_result" => 0,
+    "error" => 0,
+    "success" => 0,
+   )
+
+# Print each category as a count and as a share of all configurations tried.
+function print_counters(counters)
+    total = counters["total"]
+    # Report 0.0% rather than NaN% when nothing has been counted.
+    share(categ) = total == 0 ? 0.0 : round(100 * counters[categ] / total; digits=1)
+    count_str(categ) = "$(counters[categ]) ($(share(categ))%)"
+
+    println("Total: $(total) configurations")
+    println(repeat("-", 100))
+    println("Skipped due to invalid GemmKernels config: $(count_str("invalid_config"))")
+    println("Produced incorrect result: $(count_str("invalid_result"))")
+    println("Threw an error: $(count_str("error"))")
+    println("Successful runs: $(count_str("success"))")
+end
+
+# Smallest summed device time over `num_samples` profiled calls of `f`.
+function min_device_time(f, num_samples)
+    return minimum(1:num_samples) do _
+        prof = CUDA.@profile f()
+        sum(prof.device[!, "stop"] - prof.device[!, "start"])
+    end
+end
+
+# Tune block/warp shapes and the kernel variant for one transposition layout.
+# Returns the Hyperopt result and the per-category counters.
+function optimise(transpose_a, transpose_b)
+    M = N = K = 4096
+
+    # Initial configuration, used to generate the inputs shared by all candidates.
+    BLOCK_M, BLOCK_N, BLOCK_K = 128, 128, 64
+    WARPS_M = 4
+    WARPS_N = 2
+    kernel = Kernel.matmul_singlestage
+    cf = @get_wmma_config
+    c_h, a, b, c, d = generate_inputs(cf)
+
+    @info "Starting hyperopt..."
+
+    counters = new_counters()
+
+    ho = @hyperopt for i = 1000,
+                       BLOCK_M = 2 .^ (1:8),
+                       BLOCK_N = 2 .^ (1:8),
+                       BLOCK_K = 2 .^ (1:8),
+                       WARPS_M = 2 .^ (0:3),
+                       WARPS_N = 2 .^ (0:3),
+                       kernel in [Kernel.matmul_singlestage, Kernel.matmul_pipelined]
+
+        counters["total"] += 1
+
+        try
+            cf = @get_wmma_config
+        catch err
+            if isa(err, GemmKernels.ConfigError)
+                counters["invalid_config"] += 1
+                return Inf
+            end
+
+            # Do not swallow other errors: continuing would benchmark the previous
+            # candidate's `cf` and attribute the result to the new parameters.
+            rethrow()
+        end
+
+        @info "Trying configuration: $(cf.config)"
+
+        d .= 0
+
+        try
+            run_gemm(cf, a, b, c, d)
+        catch err
+            if isa(err, GemmKernels.ConfigError)
+                counters["invalid_config"] += 1
+                return Inf
+            end
+
+            counters["error"] += 1
+
+            if isa(err, CuError)
+                @error "Configuration failed: $(cf.config)"
+                rethrow()
+            end
+
+            @info "Skipping configuration: $(cf.config)\n" * sprint(Base.showerror, err)
+            return Inf
+        end
+
+        if !verify(cf, c_h, d)
+            @warn "Configuration produced invalid result: $(cf.config)"
+            counters["invalid_result"] += 1
+            return Inf
+        end
+
+        best_time = try
+            min_device_time(() -> run_gemm(cf, a, b, c, d), NUM_SAMPLES)
+        catch err
+            counters["error"] += 1
+
+            if isa(err, CuError)
+                @error "Configuration failed: $(cf.config)"
+                rethrow()
+            end
+
+            @info "Skipping configuration: $(cf.config)\n" * sprint(Base.showerror, err)
+            return Inf
+        end
+
+        counters["success"] += 1
+        return best_time
+    end
+
+    print_counters(counters)
+
+    return ho, counters
+end
+
+# Layout tag: "T" for a transposed operand and "N" otherwise, A first then B.
+get_label(transpose_a, transpose_b) = "$(transpose_a ? "T" : "N")$(transpose_b ? "T" : "N")"
+
+# Add one series to the current plot: baseline time relative to GemmKernels time,
+# in percent, for square problem sizes 2^7 through 2^14.
+function make_plot(BLOCK_M, BLOCK_N, BLOCK_K, WARPS_M, WARPS_N, kernel, transpose_a, transpose_b)
+    label = get_label(transpose_a, transpose_b)
+
+    N_vals = 2 .^ (7:14)
+
+    ratios = map(N_vals) do N
+        @show N
+        M = K = N
+
+        cf = @get_wmma_config
+        c_h, a, b, c, d = generate_inputs(cf)
+
+        gemmkernels = min_device_time(() -> run_gemm(cf, a, b, c, d), NUM_SAMPLES_PLOT)
+        cublas = min_device_time(() -> run_baseline(cf, a, b, c, d), NUM_SAMPLES_PLOT)
+
+        100 * cublas / gemmkernels
+    end
+
+    plot!(N_vals, ratios, label=label, markershape=markershapes[label], xscale=:log2, ylims=(0, max(100, maximum(ratios))))
+    title!("$AB_type x $AB_type = $CD_type")
+    xlabel!("Matrix size [-]")
+    ylabel!("Performance relative to cuBLAS [%]")
+end
+
+# Tune every transposition layout, report the tallies and best parameters, and
+# save the comparison plot to "plot.pdf".
+function main()
+    layouts = [(transpose_a, transpose_b) for transpose_a in [false, true] for transpose_b in [false, true]]
+
+    hos = Dict()
+    total_counters = new_counters()
+
+    for layout in layouts
+        hos[layout], counters = optimise(layout...)
+        mergewith!(+, total_counters, counters)
+    end
+
+    println(repeat("=", 100))
+    println("Overall configurations:")
+    println(repeat("=", 100))
+
+    print_counters(total_counters)
+
+    println("Optimal parameters:")
+
+    for layout in layouts
+        println("$(get_label(layout...)): $(hos[layout].minimizer)")
+    end
+
+    for layout in layouts
+        make_plot(hos[layout].minimizer..., layout...)
+    end
+
+    savefig("plot.pdf")
+end
+
+isinteractive() || main()