Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KernelAbstractions support #147

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
- NotGradients
- Gradients
steps:
- run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
*.jl.*.cov
*.jl.mem
docs/build
/Manifest.toml
*Manifest.toml
benchmark/tune.json
benchmark/results
.vscode/settings.json
7 changes: 6 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
Expand All @@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
Expand All @@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
Expand All @@ -40,6 +42,7 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"

[extensions]
MollyEnzymeExt = "Enzyme"
MollyCUDAExt = "CUDA"
MollyGLMakieExt = ["GLMakie", "Colors"]
MollyKernelDensityExt = "KernelDensity"
MollyPythonCallExt = "PythonCall"
Expand All @@ -61,7 +64,9 @@ Enzyme = "0.13"
EzXML = "1"
FLoops = "0.2"
GLMakie = "0.8, 0.9, 0.10"
GPUArrays = "10"
Graphs = "1.8"
KernelAbstractions = "0.9"
KernelDensity = "0.5, 0.6"
LinearAlgebra = "1.9"
NearestNeighbors = "0.4"
Expand Down
52 changes: 23 additions & 29 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]

function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
ArrayType::Type{AT}) where AT <: AbstractArray
n_atoms = 400
n_steps = 200
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
Expand All @@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
specific_inter_lists = (InteractionList2Atoms(
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
gpu ? CuArray(bonds) : bonds,
ArrayType(Int32.(collect(1:2:n_atoms))),
ArrayType(Int32.(collect(2:2:n_atoms))),
ArrayType(bonds),
),)

neighbor_finder = NoNeighborFinder()
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
if nl
neighbor_finder = DistanceNeighborFinder(
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
eligible=ArrayType(trues(n_atoms, n_atoms)),
n_steps=10,
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
)
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
end

if gpu
coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
else
coords = copy(f32 ? starting_coords_f32 : starting_coords)
velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
end
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])

sys = System(
atoms=atoms,
Expand All @@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
end

runs = [
("CPU" , [false, false, false, false]),
("CPU f32" , [false, false, true , false]),
("CPU NL" , [true , false, false, false]),
("CPU f32 NL", [true , false, true , false]),
("CPU" , [false, false, false, Array]),
("CPU f32" , [false, false, true , Array]),
("CPU NL" , [true , false, false, Array]),
("CPU f32 NL", [true , false, true , Array]),
]
if run_parallel_tests
push!(runs, ("CPU parallel" , [false, true , false, false]))
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
push!(runs, ("CPU parallel" , [false, true , false, Array]))
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
end
if run_gpu_tests
push!(runs, ("GPU" , [false, false, false, true]))
push!(runs, ("GPU f32" , [false, false, true , true]))
push!(runs, ("GPU NL" , [true , false, false, true]))
push!(runs, ("GPU f32 NL", [true , false, true , true]))
if run_cuda_tests
push!(runs, ("GPU" , [false, false, false, CuArray]))
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
end

for (name, args) in runs
Expand Down
22 changes: 11 additions & 11 deletions benchmark/protein.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
const ff_dir = joinpath(data_dir, "force_fields")
const openmm_dir = joinpath(data_dir, "openmm_6mrr")

function setup_system(gpu::Bool, f32::Bool, units::Bool)
function setup_system(ArrayType::AbstractArray, f32::Bool, units::Bool)
T = f32 ? Float32 : Float64
ff = MolecularForceField(
T,
Expand All @@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
sys = System(
joinpath(data_dir, "6mrr_equil.pdb"),
ff;
velocities=gpu ? CuArray(velocities) : velocities,
velocities=ArrayType(velocities),
units=units,
gpu=gpu,
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
Expand All @@ -42,15 +42,15 @@ end

runs = [
# run_name gpu parr f32 units
("CPU 1 thread" , false, false, false, true ),
("CPU 1 thread f32" , false, false, true , true ),
("CPU 1 thread f32 nounits" , false, false, true , false),
("CPU $n_threads threads" , false, true , false, true ),
("CPU $n_threads threads f32" , false, true , true , true ),
("CPU $n_threads threads f32 nounits", false, true , true , false),
("GPU" , true , false, false, true ),
("GPU f32" , true , false, true , true ),
("GPU f32 nounits" , true , false, true , false),
("CPU 1 thread" , Array, false, false, true ),
("CPU 1 thread f32" , Array, false, true , true ),
("CPU 1 thread f32 nounits" , Array, false, true , false),
("CPU $n_threads threads" , Array, true , false, true ),
("CPU $n_threads threads f32" , Array, true , true , true ),
("CPU $n_threads threads f32 nounits", Array, true , true , false),
("GPU" , CuArray, false, false, true ),
("GPU f32" , CuArray, false, true , true ),
("GPU f32 nounits" , CuArray, false, true , false),
]

for (run_name, gpu, parallel, f32, units) in runs
Expand Down
11 changes: 11 additions & 0 deletions src/cuda.jl → ext/MollyCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
module MollyCUDAExt

using Molly
using CUDA
using ChainRulesCore
using Atomix

CUDA.Const(nl::Molly.NoNeighborList) = nl

# CUDA.jl kernels
const WARPSIZE = UInt32(32)

Expand Down Expand Up @@ -513,3 +522,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v
end
return nothing
end

end # module
2 changes: 1 addition & 1 deletion ext/MollyGLMakieExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ module MollyGLMakieExt
using Molly
import AtomsBase
using GLMakie
using Colors
using Unitful
using Colors

using LinearAlgebra

Expand Down
6 changes: 3 additions & 3 deletions ext/MollyPythonCallExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module MollyPythonCallExt
using Molly
using PythonCall
import AtomsCalculators
using CUDA
using GPUArrays
using StaticArrays
using Unitful

Expand Down Expand Up @@ -91,7 +91,7 @@ end

uconvert_vec(x...) = uconvert.(x...)

function AtomsCalculators.forces(sys::System{D, G, T},
function AtomsCalculators.forces(sys::System{D, AT, T},
ase_calc::ASECalculator;
kwargs...) where {D, G, T}
update_ase_calc!(ase_calc, sys)
Expand All @@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T},
else
fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å")
end
return G ? CuArray(fs_unit) : fs_unit
return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit
end

function AtomsCalculators.potential_energy(sys::System{D, G, T},
Expand Down
5 changes: 3 additions & 2 deletions src/Molly.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import BioStructures # Imported to avoid clashing names
using CellListMap
import Chemfiles
using Combinatorics
using CUDA
using KernelAbstractions
using GPUArrays
using DataStructures
using Distances
using Distributions
Expand All @@ -34,7 +35,7 @@ include("types.jl")
include("units.jl")
include("spatial.jl")
include("cutoffs.jl")
include("cuda.jl")
include("kernels.jl")
include("force.jl")
include("interactions/lennard_jones.jl")
include("interactions/soft_sphere.jl")
Expand Down
1 change: 1 addition & 0 deletions src/analysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) whe
n_atoms = length(coords)
diag_cpu = Diagonal(ones(T, n_atoms))
diag = isa(coords, CuArray) ? CuArray(diag_cpu) : diag_cpu
diag = isa(coords, AbstractGPUArray) ? get_array_type(coords)((diag_cpu)) : diag_cpu
dists = distances(coords, boundary) .+ diag
sum_inv_dists = sum(inv.(dists)) - sum(inv(diag))
inv_R_hyd = sum_inv_dists / (2 * n_atoms^2)
Expand Down
8 changes: 4 additions & 4 deletions src/coupling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ struct AndersenThermostat{T, C}
coupling_const::C
end

function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim,
function apply_coupling!(sys::System{D, AT}, thermostat::AndersenThermostat, sim,
neighbors=nothing, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where D
n_threads::Integer=Threads.nthreads()) where {D, AT}
for i in eachindex(sys)
if rand() < (sim.dt / thermostat.coupling_const)
sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k;
Expand All @@ -70,9 +70,9 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat,
return false
end

function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim,
function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim,
neighbors=nothing, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, T}
n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
atoms_to_bump = T.(rand(length(sys)) .< (sim.dt / thermostat.coupling_const))
atoms_to_leave = one(T) .- atoms_to_bump
atoms_to_bump_dev = move_array(atoms_to_bump, sys)
Expand Down
11 changes: 6 additions & 5 deletions src/energy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads())
return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads)
end

function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, T}
function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, AT, T}
pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters))
sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists))
Expand Down Expand Up @@ -247,10 +247,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a
return pe
end

function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, T}
pe_vec_nounits = CUDA.zeros(T, 1)
function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
n_atoms = length(sys)
val_ft = Val(T)
pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1)

pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
if length(pairwise_inters_nonl) > 0
Expand Down
9 changes: 5 additions & 4 deletions src/force.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,9 @@ function init_forces_buffer(forces_nounits, n_threads)
end
end

function init_forces_buffer(forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T}
return CUDA.zeros(T, D, length(forces_nounits))
function init_forces_buffer(forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T}
return KernelAbstractions.zeros(get_backend(forces_nounits),
T, D, length(forces_nounits))
end

"""
Expand Down Expand Up @@ -346,9 +347,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force
return fs_nounits
end

function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors,
function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors,
fs_mat=CUDA.zeros(T, D, length(sys)), step_n::Integer=0;
n_threads::Integer=Threads.nthreads()) where {D, T}
n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
fill!(fs_mat, zero(T))
val_ft = Val(T)

Expand Down
Loading
Loading