From a173ac2b26dc3523e6b4e6207961357c30b94668 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 6 Sep 2023 16:12:31 +0200 Subject: [PATCH] Adding KernelAbstractions tooling for Molly and tests --- .github/workflows/CI.yml | 1 + .gitignore | 2 +- Project.toml | 7 +- benchmark/benchmarks.jl | 52 ++-- benchmark/protein.jl | 22 +- src/cuda.jl => ext/MollyCUDAExt.jl | 11 + ext/MollyGLMakieExt.jl | 2 +- ext/MollyPythonCallExt.jl | 6 +- src/Molly.jl | 5 +- src/analysis.jl | 1 + src/coupling.jl | 8 +- src/energy.jl | 11 +- src/force.jl | 9 +- src/interactions/implicit_solvent.jl | 120 +++++----- src/kernels.jl | 345 +++++++++++++++++++++++++++ src/neighbors.jl | 43 ++-- src/setup.jl | 98 ++++---- src/simulators.jl | 20 +- src/spatial.jl | 17 +- src/types.jl | 48 ++-- test/Project.toml | 1 + test/basic.jl | 39 ++- test/energy_conservation.jl | 12 +- test/minimization.jl | 14 +- test/protein.jl | 14 +- test/runtests.jl | 24 +- test/simulation.jl | 94 ++++---- 27 files changed, 709 insertions(+), 317 deletions(-) rename src/cuda.jl => ext/MollyCUDAExt.jl (99%) create mode 100644 src/kernels.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7bb822ad..3145e136 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,6 +29,7 @@ jobs: - NotGradients - Gradients steps: + - run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE" - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: diff --git a/.gitignore b/.gitignore index 293442ed..697b7041 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ *.jl.*.cov *.jl.mem docs/build -/Manifest.toml +*Manifest.toml benchmark/tune.json benchmark/results .vscode/settings.json diff --git a/Project.toml b/Project.toml index 1adea086..262fa0b2 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864" Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" @@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" @@ -40,6 +42,7 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] MollyEnzymeExt = "Enzyme" +MollyCUDAExt = "CUDA" MollyGLMakieExt = ["GLMakie", "Colors"] MollyKernelDensityExt = "KernelDensity" MollyPythonCallExt = "PythonCall" @@ -61,7 +64,9 @@ Enzyme = "0.13" EzXML = "1" FLoops = "0.2" GLMakie = "0.8, 0.9, 0.10" +GPUArrays = "10" Graphs = "1.8" +KernelAbstractions = "0.9" KernelDensity = "0.5, 0.6" LinearAlgebra = "1.9" NearestNeighbors = "0.4" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 08e6c5b4..86b0deea 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at const starting_coords_f32 = [Float32.(c) for c in starting_coords] const starting_velocities_f32 = [Float32.(c) for c in starting_velocities] -function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) +function test_sim(nl::Bool, parallel::Bool, f32::Bool, + ArrayType::Type{AT}) where AT <: AbstractArray n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -72,9 +73,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + ArrayType(Int32.(collect(1:2:n_atoms))), + ArrayType(Int32.(collect(2:2:n_atoms))), + ArrayType(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -82,24 +83,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=ArrayType(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) end - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords)) + velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) sys = System( atoms=atoms, @@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end -if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) +if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) end for (name, args) in runs diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 30f512c0..2bef6ff5 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data") const ff_dir = joinpath(data_dir, "force_fields") const openmm_dir = joinpath(data_dir, "openmm_6mrr") -function setup_system(gpu::Bool, f32::Bool, units::Bool) +function setup_system(ArrayType::AbstractArray, f32::Bool, units::Bool) T = f32 ? Float32 : Float64 ff = MolecularForceField( T, @@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=gpu ? CuArray(velocities) : velocities, + velocities=ArrayType(velocities), units=units, gpu=gpu, dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff), @@ -42,15 +42,15 @@ end runs = [ # run_name gpu parr f32 units - ("CPU 1 thread" , false, false, false, true ), - ("CPU 1 thread f32" , false, false, true , true ), - ("CPU 1 thread f32 nounits" , false, false, true , false), - ("CPU $n_threads threads" , false, true , false, true ), - ("CPU $n_threads threads f32" , false, true , true , true ), - ("CPU $n_threads threads f32 nounits", false, true , true , false), - ("GPU" , true , false, false, true ), - ("GPU f32" , true , false, true , true ), - ("GPU f32 nounits" , true , false, true , false), + ("CPU 1 thread" , Array, false, false, true ), + ("CPU 1 thread f32" , Array, false, true , true ), + ("CPU 1 thread f32 nounits" , Array, false, true , false), + ("CPU $n_threads threads" , Array, true , false, true ), + ("CPU $n_threads threads f32" , Array, true , true , true ), + ("CPU $n_threads threads f32 nounits", Array, true , true , false), + ("GPU" , CuArray, false, false, true ), + ("GPU f32" , CuArray, false, true , true ), + ("GPU f32 nounits" , CuArray, false, true , false), ] for (run_name, gpu, parallel, f32, units) in runs diff --git a/src/cuda.jl b/ext/MollyCUDAExt.jl similarity index 99% rename from src/cuda.jl rename to ext/MollyCUDAExt.jl index 65ba1ede..90ad4897 100644 --- a/src/cuda.jl +++ b/ext/MollyCUDAExt.jl @@ -1,3 +1,12 @@ +module MollyCUDAExt + +using Molly +using CUDA +using ChainRulesCore +using Atomix + +CUDA.Const(nl::Molly.NoNeighborList) = nl + # CUDA.jl kernels const WARPSIZE = UInt32(32) @@ -513,3 +522,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v end return nothing end + +end # module diff --git a/ext/MollyGLMakieExt.jl b/ext/MollyGLMakieExt.jl index 5509ddde..fa7a4909 100644 --- a/ext/MollyGLMakieExt.jl +++ b/ext/MollyGLMakieExt.jl @@ -6,8 +6,8 @@ module MollyGLMakieExt using Molly import AtomsBase using GLMakie -using Colors using Unitful +using Colors using LinearAlgebra diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl index e1afaeb8..9d0a26bf 100644 --- a/ext/MollyPythonCallExt.jl +++ b/ext/MollyPythonCallExt.jl @@ -6,7 +6,7 @@ module MollyPythonCallExt using Molly using PythonCall import AtomsCalculators -using CUDA +using GPUArrays using StaticArrays using Unitful @@ -91,7 +91,7 @@ end uconvert_vec(x...) = uconvert.(x...) -function AtomsCalculators.forces(sys::System{D, G, T}, +function AtomsCalculators.forces(sys::System{D, AT, T}, ase_calc::ASECalculator; kwargs...) where {D, G, T} update_ase_calc!(ase_calc, sys) @@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T}, else fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å") end - return G ? CuArray(fs_unit) : fs_unit + return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit end function AtomsCalculators.potential_energy(sys::System{D, G, T}, diff --git a/src/Molly.jl b/src/Molly.jl index 19664deb..08026b18 100644 --- a/src/Molly.jl +++ b/src/Molly.jl @@ -11,7 +11,8 @@ import BioStructures # Imported to avoid clashing names using CellListMap import Chemfiles using Combinatorics -using CUDA +using KernelAbstractions +using GPUArrays using DataStructures using Distances using Distributions @@ -34,7 +35,7 @@ include("types.jl") include("units.jl") include("spatial.jl") include("cutoffs.jl") -include("cuda.jl") +include("kernels.jl") include("force.jl") include("interactions/lennard_jones.jl") include("interactions/soft_sphere.jl") diff --git a/src/analysis.jl b/src/analysis.jl index 01429b5a..08e68d2a 100644 --- a/src/analysis.jl +++ b/src/analysis.jl @@ -90,6 +90,7 @@ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) whe n_atoms = length(coords) diag_cpu = Diagonal(ones(T, n_atoms)) diag = isa(coords, CuArray) ? CuArray(diag_cpu) : diag_cpu + diag = isa(coords, AbstractGPUArray) ? get_array_type(coords)((diag_cpu)) : diag_cpu dists = distances(coords, boundary) .+ diag sum_inv_dists = sum(inv.(dists)) - sum(inv(diag)) inv_R_hyd = sum_inv_dists / (2 * n_atoms^2) diff --git a/src/coupling.jl b/src/coupling.jl index 5168580c..1cca8a01 100644 --- a/src/coupling.jl +++ b/src/coupling.jl @@ -58,9 +58,9 @@ struct AndersenThermostat{T, C} coupling_const::C end -function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D, AT}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where D + n_threads::Integer=Threads.nthreads()) where {D, AT} for i in eachindex(sys) if rand() < (sim.dt / thermostat.coupling_const) sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k; @@ -70,9 +70,9 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, return false end -function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} atoms_to_bump = T.(rand(length(sys)) .< (sim.dt / thermostat.coupling_const)) atoms_to_leave = one(T) .- atoms_to_bump atoms_to_bump_dev = move_array(atoms_to_bump, sys) diff --git a/src/energy.jl b/src/energy.jl index c0f10c6a..d0403c49 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -72,8 +72,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads()) return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads) end -function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT, T} pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -247,10 +247,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a return pe end -function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} - pe_vec_nounits = CUDA.zeros(T, 1) +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} + n_atoms = length(sys) val_ft = Val(T) + pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nonl) > 0 diff --git a/src/force.jl b/src/force.jl index 760089a5..52bad345 100644 --- a/src/force.jl +++ b/src/force.jl @@ -123,8 +123,9 @@ function init_forces_buffer(forces_nounits, n_threads) end end -function init_forces_buffer(forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T} - return CUDA.zeros(T, D, length(forces_nounits)) +function init_forces_buffer(forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T} + return KernelAbstractions.zeros(get_backend(forces_nounits), + T, D, length(forces_nounits)) end """ @@ -346,9 +347,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force return fs_nounits end -function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, +function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, fs_mat=CUDA.zeros(T, D, length(sys)), step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} fill!(fs_mat, zero(T)) val_ft = Val(T) diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index 44e81da8..84ba7f08 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -411,10 +411,11 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) + if isa(atoms, AbstractGPUArray) + ArrayType = get_array_type(atoms) + or = ArrayType(offset_radii) + sor = ArrayType(scaled_offset_radii) + is, js = ArrayType(inds_i), ArrayType(inds_j) else or = offset_radii sor = scaled_offset_radii @@ -563,12 +564,13 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) - d0s, m0s = CuArray(table_d0), CuArray(table_m0) - αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu) + if isa(atoms, AbstractGPUArray) + ArrayType = fine_array_type(atoms) + or = ArrayType(offset_radii) + sor = ArrayType(scaled_offset_radii) + is, js = ArrayType(inds_i), ArrayType(inds_j) + d0s, m0s = ArrayType(table_d0), ArrayType(table_m0) + αs, βs, γs = ArrayType(αs_cpu), ArrayType(βs_cpu), ArrayType(γs_cpu) else or = offset_radii sor = scaled_offset_radii @@ -694,7 +696,7 @@ function born_radii_and_grad(inter::ImplicitSolventOBC{T}, coords, boundary) whe return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventOBC, coords::CuArray, boundary) +function born_radii_and_grad(inter::ImplicitSolventOBC, coords::AbstractGPUArray, boundary) coords_i = @view coords[inter.is] coords_j = @view coords[inter.js] loop_res = born_radii_loop_OBC.(coords_i, coords_j, inter.oris, inter.srjs, @@ -766,7 +768,7 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords, boundary) wh return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, boundary) where T +function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::AbstractGPUArray, boundary) where T Is, I_grads = gbsa_born_gpu(coords, inter.offset_radii, inter.scaled_offset_radii, inter.dist_cutoff, inter.offset, inter.neck_scale, inter.neck_cut, inter.d0s, inter.m0s, boundary, Val(T)) @@ -778,42 +780,41 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, bou return Bs, B_grads, I_grads end -function cuda_threads_blocks_gbsa(n_inters) +function gpu_threads_blocks_gbsa(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_IMPLICIT", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scaled_offset_radii, dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - Is_nounits = CUDA.zeros(T, n_atoms) - I_grads_nounits = CUDA.zeros(T, n_atoms, n_atoms) + Is_nounits = KernelAbstractions.zeros(backend, T, n_atoms) + I_grads_nounits = KernelAbstractions.zeros(backend, T, n_atoms, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_born_kernel!( - Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii, - dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, Val(C)) + kernel! = gbsa_born_kernel!(backend, n_threads_gpu) + kernel!(Is_nounits, I_grads_nounits, coords, offset_radii, + scaled_offset_radii, dist_cutoff, offset, neck_scale, + neck_cut, d0s, m0s, boundary, Val(C), ndrange = n_inters) Is = Is_nounits * unit(dist_cutoff)^-1 I_grads = I_grads_nounits * unit(dist_cutoff)^-2 return Is, I_grads end -function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_offset_radii_var, - dist_cutoff, offset, neck_scale, neck_cut, d0s_var, m0s_var, boundary, - ::Val{C}) where C - coords = CUDA.Const(coords_var) - offset_radii = CUDA.Const(offset_radii_var) - scaled_offset_radii = CUDA.Const(scaled_offset_radii_var) - d0s = CUDA.Const(d0s_var) - m0s = CUDA.Const(m0s_var) +@kernel function gbsa_born_kernel!(Is, I_grads, @Const(coords), + @Const(offset_radii), + @Const(scaled_offset_radii), + dist_cutoff, offset, neck_scale, neck_cut, + @Const(d0s), @Const(m0s), boundary, + ::Val{C}) where C n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -854,7 +855,6 @@ function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_off end end end - return nothing end function gb_force_loop_1(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, dist_cutoff, @@ -965,16 +965,17 @@ end function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, factor_solute, factor_solvent, kappa, Bs, atom_charges::AbstractArray{T}, force_units) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) - born_forces_mod_ustrip = CUDA.zeros(T, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) + born_forces_mod_ustrip = KernelAbstractions.zeros(backend, T, n_atoms) n_inters = n_atoms_to_n_pairs(n_atoms) + n_atoms - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_1_kernel!( - fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs, atom_charges, - Val(D), Val(force_units)) + kernel! = gbsa_force_1_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, Bs, atom_charges, + Val(D), Val(force_units), ndrange = n_inters) return fs_mat, born_forces_mod_ustrip end @@ -982,29 +983,30 @@ end function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, Bs, B_grads, I_grads, born_forces, offset_radii, scaled_offset_radii, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_2_kernel!( - fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, - scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units)) + kernel! = gbsa_force_2_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, + scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units), + ndrange = n_inters) return fs_mat end -function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs_var, atom_charges_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - Bs = CUDA.Const(Bs_var) - atom_charges = CUDA.Const(atom_charges_var) +@kernel function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, + @Const(coords), boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, + @Const(Bs), @Const(atom_charges), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters_not_self = n_atoms_to_n_pairs(n_atoms) n_inters = n_inters_not_self + n_atoms - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters if inter_i <= n_inters_not_self @@ -1050,22 +1052,17 @@ function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, bounda end end end - return nothing end -function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cutoff, or_var, - sor_var, Bs_var, B_grads_var, I_grads_var, ::Val{D}, - ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - or = CUDA.Const(or_var) - sor = CUDA.Const(sor_var) - Bs = CUDA.Const(Bs_var) - B_grads = CUDA.Const(B_grads_var) - I_grads = CUDA.Const(I_grads_var) +@kernel function gbsa_force_2_kernel!(forces, born_forces, @Const(coords), + boundary, dist_cutoff, @Const(or), + @Const(sor), @Const(Bs), + @Const(B_grads), @Const(I_grads), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -1098,7 +1095,6 @@ function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cu end end end - return nothing end function AtomsCalculators.forces(sys, inter::AbstractGBSA; kwargs...) diff --git a/src/kernels.jl b/src/kernels.jl new file mode 100644 index 00000000..274c71bd --- /dev/null +++ b/src/kernels.jl @@ -0,0 +1,345 @@ +# KernelAbstractions.jl kernels + +function get_array_type(a::AT) where AT <: AbstractArray + return AT.name.wrapper +end + +@inline function sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j, + boundary, special, ::Val{F}) where F + dr = vector(coord_i, coord_j, boundary) + f_tuple = ntuple(length(inters)) do inter_type_i + force_gpu(inters[inter_type_i], dr, coord_i, coord_j, atom_i, atom_j, boundary, special) + end + f = sum(f_tuple) + if unit(f[1]) != F + # This triggers an error but it isn't printed + # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 + # for how to throw a more meaningful error + error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") + end + return f +end + +function gpu_threads_blocks_pairwise(n_neighbors) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + return n_threads_gpu +end + +function gpu_threads_blocks_specific(n_inters) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) + return n_threads_gpu +end + +function pairwise_force_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary, + pairwise_inters, nbs, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + fs_mat = KernelAbstractions.zeros(backend, T, D, length(atoms)) + n_threads_gpu = gpu_threads_blocks_pairwise(length(nbs)) + kernel! = pairwise_force_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, atoms, boundary, pairwise_inters, nbs, + Val(D), Val(force_units), ndrange = length(nbs)) + return fs_mat +end + +@kernel function pairwise_force_kernel!(forces, @Const(coords), + @Const(atoms), boundary, inters, + @Const(neighbors), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + f = sum_pairwise_forces(inters, coords[i], coords[j], atoms[i], atoms[j], boundary, special, Val(F)) + for dim in 1:D + fval = ustrip(f[dim]) + Atomix.@atomic forces[dim, i] = forces[dim, i] - fval + Atomix.@atomic forces[dim, j] = forces[dim, j] + fval + end + end +end + +function specific_force_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords)) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_force_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.inters, + Val(D), Val(force_units), ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords)) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_force_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js, + inter_list.inters, Val(D), Val(force_units), + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords)) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_force_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js, + inter_list.ks, inter_list.inters, Val(D), Val(force_units), + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords)) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_force_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js, + inter_list.ks, inter_list.ls, inter_list.inters, Val(D), + Val(force_units), ndrange = length(inter_list)) + return fs_mat +end + +@kernel function specific_force_1_atoms_kernel!(forces, @Const(coords), + boundary, @Const(is), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i = is[inter_i] + fs = force_gpu(inters[inter_i], coords[i], boundary) + if unit(fs.f1[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + end + end +end + +@kernel function specific_force_2_atoms_kernel!(forces, @Const(coords), + boundary, @Const(is), + @Const(js), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + end + end +end + +@kernel function specific_force_3_atoms_kernel!(forces, @Const(coords), + boundary, @Const(is), + @Const(js), @Const(ks), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) + end + end +end + +@kernel function specific_force_4_atoms_kernel!(forces, @Const(coords), + boundary, @Const(is), + @Const(js), @Const(ks), + @Const(ls), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) + Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) + end + end +end + +function pairwise_pe_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary, + pairwise_inters, nbs, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + pe_vec = KernelAbstractions.zeros(backend, T, 1) + n_threads_gpu = gpu_threads_blocks_pairwise(length(nbs)) + kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec, coords, atoms, boundary, pairwise_inters, nbs, + Val(energy_units), ndrange = length(nbs)) + return pe_vec +end + +@kernel function pairwise_pe_kernel!(energy, @Const(coords), + @Const(atoms), boundary, inters, + @Const(neighbors), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + coord_i, coord_j = coords[i], coords[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, coord_i, coord_j, atoms[i], atoms[j], + boundary, special) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, coord_i, coord_j, atoms[i], atoms[j], + boundary, special) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end +end + +function specific_pe_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + pe_vec = KernelAbstractions.zeros(backend, T, 1) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_pe_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.inters, + Val(energy_units), ndrange = length(inter_list)) + return pe_vec +end + +function specific_pe_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + pe_vec = KernelAbstractions.zeros(backend, T, 1) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_pe_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js, + inter_list.inters, Val(energy_units), ndrange = length(inter_list)) + return pe_vec +end + +function specific_pe_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + pe_vec = KernelAbstractions.zeros(backend, T, 1) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_pe_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js, + inter_list.ks, inter_list.inters, Val(energy_units), + ndrange = length(inter_list)) + return pe_vec +end + +function specific_pe_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + boundary, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + pe_vec = KernelAbstractions.zeros(backend, T, 1) + n_threads_gpu = gpu_threads_blocks_specific(length(inter_list)) + kernel! = specific_pe_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js, + inter_list.ks, inter_list.ls, inter_list.inters, + Val(energy_units), ndrange = length(inter_list)) + return pe_vec +end + +@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords), + boundary, @Const(is), + @Const(inters), + ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i = is[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], boundary) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords), + boundary, @Const(is), + @Const(js), + @Const(inters), + ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords), + boundary, @Const(is), + @Const(js), @Const(ks), + @Const(inters), + ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords), + boundary, @Const(is), + @Const(js), @Const(ks), + @Const(ls), + @Const(inters), + ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end +end diff --git a/src/neighbors.jl b/src/neighbors.jl index 5289496c..e238fff8 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -64,12 +64,12 @@ function DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps, zero(eligible)) end -function find_neighbors(sys::System{D, false}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where D + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -92,20 +92,19 @@ function find_neighbors(sys::System{D, false}, return NeighborList(length(neighbors_list), neighbors_list) end -function cuda_threads_blocks_dnf(n_inters) +function gpu_threads_blocks_dnf(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_DISTANCENF", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end -function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, - boundary, sq_dist_neighbors) - coords = CUDA.Const(coords_var) - eligible = CUDA.Const(eligible_var) +@kernel function distance_neighbor_finder_kernel!(neighbors, + @Const(coords), + @Const(eligible), + boundary, sq_dist_neighbors) n_atoms = length(coords) n_inters = n_atoms_to_n_pairs(n_atoms) - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i, j = pair_index(n_atoms, inter_i) @@ -117,28 +116,28 @@ function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, end end end - return nothing end lists_to_tuple_list(i, j, w) = (Int32(i), Int32(j), w) -function find_neighbors(sys::System{D, true}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - kwargs...) where D + kwargs...) where {D, AT <: AbstractGPUArray} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end nf.neighbors .= false n_inters = n_atoms_to_n_pairs(length(sys)) - n_threads_gpu, n_blocks = cuda_threads_blocks_dnf(n_inters) + n_threads_gpu = gpu_threads_blocks_dnf(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks distance_neighbor_finder_kernel!( - nf.neighbors, sys.coords, nf.eligible, sys.boundary, nf.dist_cutoff^2, - ) + backend = get_backend(sys.coords) + kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu) + kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary, + nf.dist_cutoff^2, ndrange = n_inters) pairs = findall(nf.neighbors) nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2) @@ -307,19 +306,19 @@ function reduce_pairs(neighbors::NeighborList, neighbors_threaded::Vector{Neighb return neighbors end -function find_neighbors(sys::System{D, G}, +function find_neighbors(sys::System{D, AT}, nf::CellListMapNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where {D, G} + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end if isnothing(current_neighbors) neighbors = NeighborList() - elseif G + elseif AT <: AbstractGPUArray neighbors = NeighborList(current_neighbors.n, Array(current_neighbors.list)) else neighbors = current_neighbors @@ -351,8 +350,8 @@ function find_neighbors(sys::System{D, G}, ) nf.cl = cl - if G - return NeighborList(neighbors.n, CuArray(neighbors.list)) + if AT <: AbstractGPUArray + return NeighborList(neighbors.n, AT(neighbors.list)) else return neighbors end diff --git a/src/setup.jl b/src/setup.jl index 978ad31a..7a519b5b 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -428,8 +428,8 @@ are not available when reading Gromacs files. - `loggers=()`: the loggers that record properties of interest during a simulation. - `units::Bool=true`: whether to use Unitful quantities. -- `gpu::Bool=false`: whether to move the relevant parts of the system onto - the GPU. +- `ArrayType::AbstractArray = Array`: The ArrayType desired for the simulation + (for GPU support, use CuArray or ROCArray) - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions. - `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be greater than `dist_cutoff`. @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + ArrayType::Type{AT} where AT <: AbstractArray = Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -824,9 +824,9 @@ function System(coord_file::AbstractString, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? CuArray([bonds.inters...]) : [bonds.inters...], + ArrayType(bonds.is), + ArrayType(bonds.js), + ArrayType([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -835,30 +835,30 @@ function System(coord_file::AbstractString, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + ArrayType(angles.is), + ArrayType(angles.js), + ArrayType(angles.ks), + ArrayType([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad, + ArrayType(torsions.is), + ArrayType(torsions.js), + ArrayType(torsions.ks), + ArrayType(torsions.ls), + ArrayType(torsion_inters_pad), torsions.types, )) end if length(impropers.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(impropers.is) : impropers.is, - gpu ? CuArray(impropers.js) : impropers.js, - gpu ? CuArray(impropers.ks) : impropers.ks, - gpu ? CuArray(impropers.ls) : impropers.ls, - gpu ? CuArray(improper_inters_pad) : improper_inters_pad, + ArrayType(impropers.is), + ArrayType(impropers.js), + ArrayType(impropers.ks), + ArrayType(impropers.ls), + ArrayType(improper_inters_pad), impropers.types, )) end @@ -887,10 +887,10 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if gpu || !use_cell_list + if (ArrayType <: AbstractGPUArray) || !use_cell_list neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? CuArray(eligible) : eligible), - special=(gpu ? CuArray(special) : special), + eligible=(ArrayType(eligible)), + special=(ArrayType(special)), n_steps=10, dist_cutoff=T(dist_neighbors), ) @@ -904,13 +904,9 @@ function System(coord_file::AbstractString, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = ArrayType(atoms) + coords = ArrayType(coords) if isnothing(velocities) if units @@ -965,7 +961,7 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + ArrayType::Type{AT} where AT <: AbstractArray = Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -1246,9 +1242,9 @@ function System(T::Type, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? CuArray([bonds.inters...]) : [bonds.inters...], + ArrayType(bonds.is), + ArrayType(bonds.js), + ArrayType([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -1257,29 +1253,29 @@ function System(T::Type, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + ArrayType(angles.is), + ArrayType(angles.js), + ArrayType(angles.ks), + ArrayType([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray([torsions.inters...]) : [torsions.inters...], + ArrayType(torsions.is), + ArrayType(torsions.js), + ArrayType(torsions.ks), + ArrayType(torsions.ls), + ArrayType([torsions.inters...]), torsions.types, )) end specific_inter_lists = tuple(specific_inter_array...) - if gpu || !use_cell_list + if ArrayType <: AbstractGPUArray || !use_cell_list neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? CuArray(eligible) : eligible), - special=(gpu ? CuArray(special) : special), + eligible=(ArrayType(eligible)), + special=(ArrayType(special)), n_steps=10, dist_cutoff=T(dist_neighbors), ) @@ -1293,13 +1289,9 @@ function System(T::Type, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = ArrayType(atoms) + coords = ArrayType(coords) if isnothing(velocities) if units diff --git a/src/simulators.jl b/src/simulators.jl index eb1e9e8b..8eea8973 100644 --- a/src/simulators.jl +++ b/src/simulators.jl @@ -818,12 +818,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during Successful exchanges should exchange coordinates and velocities as appropriate. Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful. """ -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem{D, AT, T}, sim::TemperatureREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} T_n, T_m = sim.temperatures[n], sim.temperatures[m] β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -909,12 +909,12 @@ function simulate!(sys::ReplicaSystem, return simulate_remd!(sys, sim, n_steps; rng=rng, n_threads=n_threads, run_loggers=run_loggers) end -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem{D, AT, T}, sim::HamiltonianREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} T_sim = sim.temperature β_sim = inv(sys.k * T_sim) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -1034,11 +1034,11 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict()) return MetropolisMonteCarlo(temperature, trial_moves, trial_args) end -@inline function simulate!(sys::System{D, G, T}, +@inline function simulate!(sys::System{D, AT, T}, sim::MetropolisMonteCarlo, n_steps::Integer; n_threads::Integer=Threads.nthreads(), - run_loggers=true) where {D, G, T} + run_loggers=true) where {D, AT, T} neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads) E_old = potential_energy(sys, neighbors; n_threads=n_threads) coords_old = similar(sys.coords) @@ -1075,8 +1075,8 @@ Performs a random translation of the coordinates of a randomly selected atom in The translation is generated using a uniformly selected direction and uniformly selected length in range [0, 1) scaled by `shift_size` which should have appropriate length units. """ -function random_uniform_translation!(sys::System{D, G, T}; - shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, G, T} +function random_uniform_translation!(sys::System{D, AT, T}; + shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, AT, T} rand_idx = rand(eachindex(sys)) direction = random_unit_vector(T, D) magnitude = rand(T) * shift_size @@ -1093,8 +1093,8 @@ The translation is generated using a uniformly chosen direction and length selec the standard normal distribution i.e. with mean 0 and standard deviation 1, scaled by `shift_size` which should have appropriate length units. """ -function random_normal_translation!(sys::System{D, G, T}; - shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, G, T} +function random_normal_translation!(sys::System{D, AT, T}; + shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, AT, T} rand_idx = rand(eachindex(sys)) direction = random_unit_vector(T, D) magnitude = randn(T) * shift_size diff --git a/src/spatial.jl b/src/spatial.jl index de0941b1..07502a9f 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -614,11 +614,11 @@ function random_velocities(sys::AtomsBase.AbstractSystem{2}, temp; rng=Random.de end function random_velocities(sys::System{3, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) + return AT(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) end function random_velocities(sys::System{2, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) + return AT(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) end """ @@ -738,9 +738,9 @@ function virial(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return v end -function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_nonl, - pairwise_inters_nl) where {D, G, T} - if G +function virial(sys::System{D, AT, T}, neighbors_dev, step_n, pairwise_inters_nonl, + pairwise_inters_nl) where {D, AT, T} + if AT <: AbstractGPUArray coords, velocities, atoms = Array(sys.coords), Array(sys.velocities), Array(sys.atoms) if isnothing(neighbors_dev) neighbors = neighbors_dev @@ -792,7 +792,7 @@ function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_non end # Default for general interactions -function virial(inter, sys::System{D, G, T}, args...; kwargs...) where {D, G, T} +function virial(inter, sys::System{D, AT, T}, args...; kwargs...) where {D, AT, T} return zero(T) * sys.energy_units end @@ -874,8 +874,9 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end end -function molecule_centers(coords::CuArray, boundary, topology) - return CuArray(molecule_centers(Array(coords), boundary, topology)) +function molecule_centers(coords::AbstractGPUArray, boundary, topology) + ArrayType = get_array_type(coords) + return ArrayType(molecule_centers(Array(coords), boundary, topology)) end # Allows scaling multiple vectors at once by broadcasting this function diff --git a/src/types.jl b/src/types.jl index 225d9cff..977fbe7b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -431,8 +431,6 @@ Base.firstindex(::NoNeighborList) = 1 Base.lastindex(nl::NoNeighborList) = length(nl) Base.eachindex(nl::NoNeighborList) = Base.OneTo(length(nl)) -CUDA.Const(nl::NoNeighborList) = nl - """ System(; ) @@ -481,8 +479,8 @@ interface described there. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the system. """ -mutable struct System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, - L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D} +mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, + L, F, E, K, M, DA} <: AbstractSystem{D} atoms::A coords::C boundary::B @@ -521,7 +519,7 @@ function System(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(coords, CuArray) + AT = get_array_type(coords) T = float_type(boundary) A = typeof(atoms) C = typeof(coords) @@ -567,19 +565,19 @@ function System(; end end - if isa(atoms, CuArray) && !isa(coords, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(coords, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if isa(coords, CuArray) && !isa(atoms, CuArray) + if isa(coords, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && !isa(vels, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(vels, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if isa(vels, CuArray) && !isa(atoms, CuArray) + if isa(vels, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && length(constraints) > 0 + if isa(atoms, AbstractGPUArray) && length(constraints) > 0 @warn "Constraints are not currently compatible with simulation on the GPU" end @@ -596,7 +594,7 @@ function System(; check_units(atoms, coords, vels, energy_units, force_units, pairwise_inters, specific_inter_lists, general_inters, boundary) - return System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( + return System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( atoms, coords, boundary, vels, atoms_data, topology, pairwise_inters, specific_inter_lists, general_inters, constraints, neighbor_finder, loggers, df, force_units, energy_units, k_converted, atom_masses, data) @@ -847,7 +845,7 @@ construction where `n` is the number of threads to be used per replica. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the replica system. """ -mutable struct ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} +mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AbstractSystem{D} atoms::A n_replicas::Int atoms_data::AD @@ -884,7 +882,8 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(replica_coords[1], CuArray) + D = n_dimensions(boundary) + AT = get_array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) AD = typeof(atoms_data) @@ -995,25 +994,25 @@ function ReplicaSystem(; throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_coords) + n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_coords) if !(n_cuarray == n_replicas || n_cuarray == 0) throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_velocities) + n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_velocities) if !(n_cuarray == n_replicas || n_cuarray == 0) throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end @@ -1023,7 +1022,7 @@ function ReplicaSystem(; k_converted = convert_k_units(T, k, energy_units) K = typeof(k_converted) - replicas = Tuple(System{D, G, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), + replicas = Tuple(System{D, AT, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), typeof(replica_specific_inter_lists[i]), typeof(replica_general_inters[i]), typeof(replica_constraints[i]), NF, typeof(replica_loggers[i]), F, E, K, M, Nothing}( @@ -1034,7 +1033,7 @@ function ReplicaSystem(; force_units, energy_units, k_converted, atom_masses, nothing) for i in 1:n_replicas) R = typeof(replicas) - return ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA}( + return ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA}( atoms, n_replicas, atoms_data, exchange_logger, force_units, energy_units, k_converted, replicas, data) end @@ -1044,7 +1043,7 @@ end Whether a [`System`](@ref) or [`ReplicaSystem`](@ref) is on the GPU. """ -is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G +is_on_gpu(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT <: AbstractGPUArray """ float_type(sys) @@ -1052,7 +1051,7 @@ is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G The float type a [`System`](@ref), [`ReplicaSystem`](@ref) or bounding box uses. """ -float_type(::Union{System{D, G, T}, ReplicaSystem{D, G, T}}) where {D, G, T} = T +float_type(::Union{System{D, AT, T}, ReplicaSystem{D, AT, T}}) where {D, AT, T} = T """ masses(sys) @@ -1071,8 +1070,7 @@ charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms) charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i]) # Move an array to the GPU depending on whether the system is on the GPU -move_array(arr, ::System{D, false}) where {D} = arr -move_array(arr, ::System{D, true }) where {D} = CuArray(arr) +move_array(arr, ::System{D, AT}) where {D, AT} = AT(arr) Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i] Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms) diff --git a/test/Project.toml b/test/Project.toml index 3901cc98..e7e9a459 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsBaseTesting = "ed7c10db-df7e-4efa-a7be-4f4190f7f227" diff --git a/test/basic.jl b/test/basic.jl index e549fae7..686de36b 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -176,16 +176,16 @@ @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)] ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...) - for gpu in gpu_list - sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=gpu, use_cell_list=false) + for ArrayType in array_list + sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; ArrayType=ArrayType, use_cell_list=false) mcs = molecule_centers(sys.coords, sys.boundary, sys.topology) - @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.04u"nm") + @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm") # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) sys.neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? CuArray(no_nbs) : no_nbs), + eligible=(ArrayType(no_nbs)), dist_cutoff=1.0u"nm", ) coords_start = copy(sys.coords) @@ -310,8 +310,9 @@ end end end - if run_gpu_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=true) + if run_cuda_tests + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; + ArrayType=CuArray) for neighbor_finder in (DistanceNeighborFinder,) nf_gpu = neighbor_finder( eligible=sys_gpu.neighbor_finder.eligible, @@ -320,7 +321,25 @@ end ) neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) @test length(neighbors_gpu) == n_neighbors_ref - CUDA.allowscalar() do + GPUArrays.allowscalar() do + @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} + end + @test identical_neighbors(neighbors_gpu, neighbors_ref) + end + end + + if run_rocm_tests + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; + ArrayType=ROCArray) + for neighbor_finder in (DistanceNeighborFinder,) + nf_gpu = neighbor_finder( + eligible=sys_gpu.neighbor_finder.eligible, + special=sys_gpu.neighbor_finder.special, + dist_cutoff=dist_cutoff, + ) + neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) + @test length(neighbors_gpu) == n_neighbors_ref + GPUArrays.allowscalar() do @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} end @test identical_neighbors(neighbors_gpu, neighbors_ref) @@ -336,9 +355,13 @@ end coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm" coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm" @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å" - if run_gpu_tests + if run_cuda_tests @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" end + if run_rocm_tests + @test rmsd(ROCArray(coords_1), + ROCArray(coords_2)) ≈ 2.54859467758795u"Å" + end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) coords = SVector{3, Float64}.(eachcol(BioStructures.coordarray(bb_atoms))) / 10 * u"nm" diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index d64a4cf2..8ae21aad 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -6,7 +6,7 @@ using CUDA using Test @testset "Lennard-Jones energy conservation" begin - function test_energy_conservation(gpu::Bool, n_threads::Integer, n_steps::Integer) + function test_energy_conservation(ArrayType::AbstractArray, n_threads::Integer, n_steps::Integer) n_atoms = 2_000 atom_mass = 40.0u"g/mol" temp = 1.0u"K" @@ -26,8 +26,8 @@ using Test coords = place_atoms(n_atoms, boundary; min_dist=0.6u"nm") sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(coords) : coords), + atoms=(ArrayType(atoms) : atoms), + coords=(ArrayType(coords) : coords), boundary=boundary, pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=false),), loggers=( @@ -56,11 +56,11 @@ using Test end end - test_energy_conservation(false, 1, 10_000) + test_energy_conservation(Array, 1, 10_000) if Threads.nthreads() > 1 - test_energy_conservation(false, Threads.nthreads(), 50_000) + test_energy_conservation(Array, Threads.nthreads(), 50_000) end if CUDA.functional() - test_energy_conservation(true, 1, 100_000) + test_energy_conservation(CuArray, 1, 100_000) end end diff --git a/test/minimization.jl b/test/minimization.jl index 83a10f0e..253c5e8b 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -42,14 +42,14 @@ @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1"; atol=1e-4u"kJ * mol^-1") - if run_gpu_tests - coords = CuArray([ + for ArrayType in array_list[2:end] + coords = ArrayType([ SVector(1.0, 1.0, 1.0)u"nm", SVector(1.6, 1.0, 1.0)u"nm", SVector(1.4, 1.6, 1.0)u"nm", ]) sys = System( - atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), + atoms=ArrayType([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), coords=coords, boundary=CubicBoundary(5.0u"nm"), pairwise_inters=(LennardJones(),), @@ -57,10 +57,12 @@ sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1") simulate!(sys, sim) - dists = distances(sys.coords, sys.boundary) + dists = Array(distances(sys.coords, sys.boundary)) dists_flat = dists[triu(trues(3, 3), 1)] - @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat) + + # GPU tolerances are more lenient (possibly for f32 shenanigans) + @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat) @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1"; - atol=1e-4u"kJ * mol^-1") + atol=1e-2u"kJ * mol^-1") end end diff --git a/test/protein.jl b/test/protein.jl index 7153eb63..9d145d10 100644 --- a/test/protein.jl +++ b/test/protein.jl @@ -179,12 +179,12 @@ end @test pis_grad == sys_nounits.pairwise_inters # Test the same simulation on the GPU - if run_gpu_tests + for ArrayType in array_list[2:end] sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=CuArray(copy(velocities_start)), - gpu=true, + velocities=ArrayType(deepcopy(velocities_start)), + ArrayType = ArrayType, center_coords=false, ) @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1" @@ -211,9 +211,9 @@ end sys_nounits = System( joinpath(data_dir, "6mrr_equil.pdb"), ff_nounits; - velocities=CuArray(copy(ustrip_vec.(velocities_start))), + velocities=ArrayType(deepcopy(ustrip_vec.(velocities_start))), units=false, - gpu=true, + ArrayType = ArrayType, center_coords=false, ) @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1" @@ -248,13 +248,13 @@ end @testset "Implicit solvent" begin ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...) - for gpu in gpu_list + for ArrayType in array_list for solvent_model in ("obc2", "gbn2") sys = System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; boundary=CubicBoundary(100.0u"nm"), - gpu=gpu, + ArrayType = ArrayType, dist_cutoff=5.0u"nm", dist_neighbors=5.0u"nm", implicit_solvent=solvent_model, diff --git a/test/runtests.jl b/test/runtests.jl index 00fc6654..3be6418e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,8 @@ using AtomsCalculators.AtomsCalculatorsTesting import BioStructures # Imported to avoid clashing names using CUDA using Enzyme +using AMDGPU +using GPUArrays using FiniteDifferences using KernelDensity import SimpleCrystals @@ -52,6 +54,7 @@ end # Allow CUDA device to be specified const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) +<<<<<<< HEAD const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() const gpu_list = (run_gpu_tests ? (false, true) : (false,)) if run_gpu_tests @@ -59,8 +62,27 @@ if run_gpu_tests @info "The GPU tests will be run on device $DEVICE" elseif get(ENV, "GPUTESTS", "1") == "0" @warn "The GPU tests will not be run as GPUTESTS is set to 0" +======= +const run_cuda_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() +const run_rocm_tests = get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional() + +array_list = (Array,) + +if run_cuda_tests + array_list = (array_list..., CuArray) + device!(parse(Int, DEVICE)) + @info "The CUDA tests will be run on device $DEVICE" +>>>>>>> c820f41f (Adding KernelAbstractions tooling for Molly and tests) +else + @warn "The CUDA tests will not be run as a CUDA-enabled device is not available" +end + +if run_rocm_tests + array_list = (array_list..., ROCArray) + AMDGPU.device!(AMDGPU.devices()[parse(Int, DEVICE)+1]) + @info "The ROCM tests will be run on device $DEVICE" else - @warn "The GPU tests will not be run as a CUDA-enabled device is not available" + @warn "The ROCM tests will not be run as a ROCM-enabled device is not available" end const data_dir = normpath(@__DIR__, "..", "data") diff --git a/test/simulation.jl b/test/simulation.jl index c9ada6ef..4a74055f 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -424,7 +424,7 @@ end end @testset "Position restraints" begin - for gpu in gpu_list + for ArrayType in array_list n_atoms = 10 n_atoms_res = n_atoms ÷ 2 n_steps = 2_000 @@ -435,8 +435,8 @@ end sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1") sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(copy(starting_coords)) : copy(starting_coords)), + atoms=ArrayType(atoms), + coords=ArrayType(deepcopy(starting_coords)), boundary=boundary, atoms_data=atoms_data, pairwise_inters=(LennardJones(),), @@ -925,15 +925,14 @@ end vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat)) for sim in (lang_baro, vvand_baro) - for gpu in gpu_list - if gpu && sim == vvand_baro + for ArrayType in array_list + if ArrayType <: AbstractGPUArray && sim == vvand_baro continue end - AT = gpu ? CuArray : Array sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=ArrayType(atoms), + coords=ArrayType(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -988,16 +987,15 @@ end SVector(nothing , nothing , nothing ), # Uncoupled ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for ArrayType in array_list for (press_i, press) in enumerate(pressure_test_set) - if gpu && press_i != 2 + if ArrayType <: AbstractGPUArray && press_i != 2 continue end sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=ArrayType(atoms), + coords=ArrayType(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1056,16 +1054,15 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for ArrayType in array_list for (barostat_i, barostat) in enumerate(barostat_test_set) - if gpu && barostat_i != 2 + if ArrayType <: AbstractGPUArray && barostat_i != 2 continue end sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=ArrayType(atoms), + coords=ArrayType(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1179,7 +1176,8 @@ end starting_coords_f32 = [Float32.(c) for c in starting_coords] starting_velocities_f32 = [Float32.(c) for c in starting_velocities] - function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) + function test_sim(nl::Bool, parallel::Bool, f32::Bool, + ArrayType::Type{AT}) where AT <: AbstractArray n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -1189,9 +1187,9 @@ end r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + ArrayType(Int32.(collect(1:2:n_atoms))), + ArrayType(Int32.(collect(2:2:n_atoms))), + ArrayType(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -1199,7 +1197,7 @@ end pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=ArrayType(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) @@ -1207,17 +1205,10 @@ end end show(devnull, neighbor_finder) - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords)) + velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) s = System( atoms=atoms, @@ -1229,7 +1220,7 @@ end neighbor_finder=neighbor_finder, ) - @test is_on_gpu(s) == gpu + @test is_on_gpu(s) == (ArrayType <: AbstractGPUArray) @test float_type(s) == (f32 ? Float32 : Float64) n_threads = parallel ? Threads.nthreads() : 1 @@ -1240,23 +1231,30 @@ end end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end - if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) + if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) end + if run_rocm_tests + push!(runs, ("GPU" , [false, false, false, ROCArray])) + push!(runs, ("GPU f32" , [false, false, true , ROCArray])) + push!(runs, ("GPU NL" , [true , false, false, ROCArray])) + push!(runs, ("GPU f32 NL", [true , false, true , ROCArray])) + end + final_coords_ref, E_start_ref = test_sim(runs[1][2]...) # Check all simulations give the same result to within some error