From a173ac2b26dc3523e6b4e6207961357c30b94668 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 6 Sep 2023 16:12:31 +0200
Subject: [PATCH] Adding KernelAbstractions tooling for Molly and tests

---
 .github/workflows/CI.yml             |   1 +
 .gitignore                           |   2 +-
 Project.toml                         |   7 +-
 benchmark/benchmarks.jl              |  52 ++--
 benchmark/protein.jl                 |  22 +-
 src/cuda.jl => ext/MollyCUDAExt.jl   |  11 +
 ext/MollyGLMakieExt.jl               |   2 +-
 ext/MollyPythonCallExt.jl            |   6 +-
 src/Molly.jl                         |   5 +-
 src/analysis.jl                      |   1 +
 src/coupling.jl                      |   8 +-
 src/energy.jl                        |  11 +-
 src/force.jl                         |   9 +-
 src/interactions/implicit_solvent.jl | 120 +++++-----
 src/kernels.jl                       | 345 +++++++++++++++++++++++++++
 src/neighbors.jl                     |  43 ++--
 src/setup.jl                         |  98 ++++----
 src/simulators.jl                    |  20 +-
 src/spatial.jl                       |  17 +-
 src/types.jl                         |  48 ++--
 test/Project.toml                    |   1 +
 test/basic.jl                        |  39 ++-
 test/energy_conservation.jl          |  12 +-
 test/minimization.jl                 |  14 +-
 test/protein.jl                      |  14 +-
 test/runtests.jl                     |  24 +-
 test/simulation.jl                   |  94 ++++----
 27 files changed, 709 insertions(+), 317 deletions(-)
 rename src/cuda.jl => ext/MollyCUDAExt.jl (99%)
 create mode 100644 src/kernels.jl

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 7bb822ad..3145e136 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -29,6 +29,7 @@ jobs:
           - NotGradients
           - Gradients
     steps:
+      - run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:
diff --git a/.gitignore b/.gitignore
index 293442ed..697b7041 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 *.jl.*.cov
 *.jl.mem
 docs/build
-/Manifest.toml
+*Manifest.toml
 benchmark/tune.json
 benchmark/results
 .vscode/settings.json
diff --git a/Project.toml b/Project.toml
index 1adea086..262fa0b2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
 BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
 Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
 FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
@@ -40,6 +42,7 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
 [extensions]
 MollyEnzymeExt = "Enzyme"
+MollyCUDAExt = "CUDA"
 MollyGLMakieExt = ["GLMakie", "Colors"]
 MollyKernelDensityExt = "KernelDensity"
 MollyPythonCallExt = "PythonCall"
@@ -61,7 +64,9 @@ Enzyme = "0.13"
 EzXML = "1"
 FLoops = "0.2"
 GLMakie = "0.8, 0.9, 0.10"
+GPUArrays = "10"
 Graphs = "1.8"
+KernelAbstractions = "0.9"
 KernelDensity = "0.5, 0.6"
 LinearAlgebra = "1.9"
 NearestNeighbors = "0.4"
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 08e6c5b4..86b0deea 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
 const starting_coords_f32 = [Float32.(c) for c in starting_coords]
 const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
+function test_sim(nl::Bool, parallel::Bool, f32::Bool,
+                  ArrayType::Type{AT}) where AT <: AbstractArray
     n_atoms = 400
     n_steps = 200
     atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,9 +73,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
     r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
     bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
     specific_inter_lists = (InteractionList2Atoms(
-        gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
-        gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
-        gpu ? CuArray(bonds) : bonds,
+        ArrayType(Int32.(collect(1:2:n_atoms))),
+        ArrayType(Int32.(collect(2:2:n_atoms))),
+        ArrayType(bonds),
     ),)
 
     neighbor_finder = NoNeighborFinder()
@@ -82,24 +83,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
     pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
     if nl
         neighbor_finder = DistanceNeighborFinder(
-            eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+            eligible=ArrayType(trues(n_atoms, n_atoms)),
             n_steps=10,
             dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
         )
         pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
     end
 
-    if gpu
-        coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
-        velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
-        atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                              ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
-    else
-        coords = copy(f32 ? starting_coords_f32 : starting_coords)
-        velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
-        atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                      ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
-    end
+    coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
+    velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
+    atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+                            ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
 
     sys = System(
         atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
 end
 
 runs = [
-    ("CPU"       , [false, false, false, false]),
-    ("CPU f32"   , [false, false, true , false]),
-    ("CPU NL"    , [true , false, false, false]),
-    ("CPU f32 NL", [true , false, true , false]),
+    ("CPU"       , [false, false, false, Array]),
+    ("CPU f32"   , [false, false, true , Array]),
+    ("CPU NL"    , [true , false, false, Array]),
+    ("CPU f32 NL", [true , false, true , Array]),
 ]
 if run_parallel_tests
-    push!(runs, ("CPU parallel"       , [false, true , false, false]))
-    push!(runs, ("CPU parallel f32"   , [false, true , true , false]))
-    push!(runs, ("CPU parallel NL"    , [true , true , false, false]))
-    push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
+    push!(runs, ("CPU parallel"       , [false, true , false, Array]))
+    push!(runs, ("CPU parallel f32"   , [false, true , true , Array]))
+    push!(runs, ("CPU parallel NL"    , [true , true , false, Array]))
+    push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
 end
-if run_gpu_tests
-    push!(runs, ("GPU"       , [false, false, false, true]))
-    push!(runs, ("GPU f32"   , [false, false, true , true]))
-    push!(runs, ("GPU NL"    , [true , false, false, true]))
-    push!(runs, ("GPU f32 NL", [true , false, true , true]))
+if run_cuda_tests
+    push!(runs, ("GPU"       , [false, false, false, CuArray]))
+    push!(runs, ("GPU f32"   , [false, false, true , CuArray]))
+    push!(runs, ("GPU NL"    , [true , false, false, CuArray]))
+    push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
 end
 
 for (name, args) in runs
diff --git a/benchmark/protein.jl b/benchmark/protein.jl
index 30f512c0..2bef6ff5 100644
--- a/benchmark/protein.jl
+++ b/benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
 const ff_dir = joinpath(data_dir, "force_fields")
 const openmm_dir = joinpath(data_dir, "openmm_6mrr")
 
-function setup_system(gpu::Bool, f32::Bool, units::Bool)
+function setup_system(ArrayType::AbstractArray, f32::Bool, units::Bool)
     T = f32 ? Float32 : Float64
     ff = MolecularForceField(
         T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
     sys = System(
         joinpath(data_dir, "6mrr_equil.pdb"),
         ff;
-        velocities=gpu ? CuArray(velocities) : velocities,
+        velocities=ArrayType(velocities),
         units=units,
         gpu=gpu,
         dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end
 
 runs = [
     # run_name                             gpu    parr   f32    units
-    ("CPU 1 thread"                      , false, false, false, true ),
-    ("CPU 1 thread f32"                  , false, false, true , true ),
-    ("CPU 1 thread f32 nounits"          , false, false, true , false),
-    ("CPU $n_threads threads"            , false, true , false, true ),
-    ("CPU $n_threads threads f32"        , false, true , true , true ),
-    ("CPU $n_threads threads f32 nounits", false, true , true , false),
-    ("GPU"                               , true , false, false, true ),
-    ("GPU f32"                           , true , false, true , true ),
-    ("GPU f32 nounits"                   , true , false, true , false),
+    ("CPU 1 thread"                      , Array, false, false, true ),
+    ("CPU 1 thread f32"                  , Array, false, true , true ),
+    ("CPU 1 thread f32 nounits"          , Array, false, true , false),
+    ("CPU $n_threads threads"            , Array, true , false, true ),
+    ("CPU $n_threads threads f32"        , Array, true , true , true ),
+    ("CPU $n_threads threads f32 nounits", Array, true , true , false),
+    ("GPU"                               , CuArray, false, false, true ),
+    ("GPU f32"                           , CuArray, false, true , true ),
+    ("GPU f32 nounits"                   , CuArray, false, true , false),
 ]
 
 for (run_name, gpu, parallel, f32, units) in runs
diff --git a/src/cuda.jl b/ext/MollyCUDAExt.jl
similarity index 99%
rename from src/cuda.jl
rename to ext/MollyCUDAExt.jl
index 65ba1ede..90ad4897 100644
--- a/src/cuda.jl
+++ b/ext/MollyCUDAExt.jl
@@ -1,3 +1,12 @@
+module MollyCUDAExt
+
+using Molly
+using CUDA
+using ChainRulesCore
+using Atomix
+
+CUDA.Const(nl::Molly.NoNeighborList) = nl
+
 # CUDA.jl kernels
 const WARPSIZE = UInt32(32)
 
@@ -513,3 +522,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v
     end
     return nothing
 end
+
+end # module
diff --git a/ext/MollyGLMakieExt.jl b/ext/MollyGLMakieExt.jl
index 5509ddde..fa7a4909 100644
--- a/ext/MollyGLMakieExt.jl
+++ b/ext/MollyGLMakieExt.jl
@@ -6,8 +6,8 @@ module MollyGLMakieExt
 using Molly
 import AtomsBase
 using GLMakie
-using Colors
 using Unitful
+using Colors
 
 using LinearAlgebra
 
diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl
index e1afaeb8..9d0a26bf 100644
--- a/ext/MollyPythonCallExt.jl
+++ b/ext/MollyPythonCallExt.jl
@@ -6,7 +6,7 @@ module MollyPythonCallExt
 using Molly
 using PythonCall
 import AtomsCalculators
-using CUDA
+using GPUArrays
 using StaticArrays
 using Unitful
 
@@ -91,7 +91,7 @@ end
 
 uconvert_vec(x...) = uconvert.(x...)
 
-function AtomsCalculators.forces(sys::System{D, G, T},
+function AtomsCalculators.forces(sys::System{D, AT, T},
                                  ase_calc::ASECalculator;
                                  kwargs...) where {D, G, T}
     update_ase_calc!(ase_calc, sys)
@@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T},
     else
         fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å")
     end
-    return G ? CuArray(fs_unit) : fs_unit
+    return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit
 end
 
 function AtomsCalculators.potential_energy(sys::System{D, G, T},
diff --git a/src/Molly.jl b/src/Molly.jl
index 19664deb..08026b18 100644
--- a/src/Molly.jl
+++ b/src/Molly.jl
@@ -11,7 +11,8 @@ import BioStructures # Imported to avoid clashing names
 using CellListMap
 import Chemfiles
 using Combinatorics
-using CUDA
+using KernelAbstractions
+using GPUArrays
 using DataStructures
 using Distances
 using Distributions
@@ -34,7 +35,7 @@ include("types.jl")
 include("units.jl")
 include("spatial.jl")
 include("cutoffs.jl")
-include("cuda.jl")
+include("kernels.jl")
 include("force.jl")
 include("interactions/lennard_jones.jl")
 include("interactions/soft_sphere.jl")
diff --git a/src/analysis.jl b/src/analysis.jl
index 01429b5a..08e68d2a 100644
--- a/src/analysis.jl
+++ b/src/analysis.jl
@@ -90,6 +90,7 @@ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) whe
     n_atoms = length(coords)
     diag_cpu = Diagonal(ones(T, n_atoms))
     diag = isa(coords, CuArray) ? CuArray(diag_cpu) : diag_cpu
+    diag = isa(coords, AbstractGPUArray) ? get_array_type(coords)((diag_cpu)) : diag_cpu
     dists = distances(coords, boundary) .+ diag
     sum_inv_dists = sum(inv.(dists)) - sum(inv(diag))
     inv_R_hyd = sum_inv_dists / (2 * n_atoms^2)
diff --git a/src/coupling.jl b/src/coupling.jl
index 5168580c..1cca8a01 100644
--- a/src/coupling.jl
+++ b/src/coupling.jl
@@ -58,9 +58,9 @@ struct AndersenThermostat{T, C}
     coupling_const::C
 end
 
-function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim,
+function apply_coupling!(sys::System{D, AT}, thermostat::AndersenThermostat, sim,
                          neighbors=nothing, step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where D
+                         n_threads::Integer=Threads.nthreads()) where {D, AT}
     for i in eachindex(sys)
         if rand() < (sim.dt / thermostat.coupling_const)
             sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k;
@@ -70,9 +70,9 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat,
     return false
 end
 
-function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim,
+function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim,
                          neighbors=nothing, step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where {D, T}
+                         n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
     atoms_to_bump = T.(rand(length(sys)) .< (sim.dt / thermostat.coupling_const))
     atoms_to_leave = one(T) .- atoms_to_bump
     atoms_to_bump_dev = move_array(atoms_to_bump, sys)
diff --git a/src/energy.jl b/src/energy.jl
index c0f10c6a..d0403c49 100644
--- a/src/energy.jl
+++ b/src/energy.jl
@@ -72,8 +72,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads())
     return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads)
 end
 
-function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0;
-                          n_threads::Integer=Threads.nthreads()) where {D, T}
+function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
+                          n_threads::Integer=Threads.nthreads()) where {D, AT, T}
     pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
     pairwise_inters_nl   = filter( use_neighbors, values(sys.pairwise_inters))
     sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists))
@@ -247,10 +247,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a
     return pe
 end
 
-function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0;
-                          n_threads::Integer=Threads.nthreads()) where {D, T}
-    pe_vec_nounits = CUDA.zeros(T, 1)
+function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0;
+                          n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
+    n_atoms = length(sys)
     val_ft = Val(T)
+    pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1)
 
     pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters))
     if length(pairwise_inters_nonl) > 0
diff --git a/src/force.jl b/src/force.jl
index 760089a5..52bad345 100644
--- a/src/force.jl
+++ b/src/force.jl
@@ -123,8 +123,9 @@ function init_forces_buffer(forces_nounits, n_threads)
     end
 end
 
-function init_forces_buffer(forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T}
-    return CUDA.zeros(T, D, length(forces_nounits))
+function init_forces_buffer(forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T}
+    return KernelAbstractions.zeros(get_backend(forces_nounits),
+                                    T, D, length(forces_nounits))
 end
 
 """
@@ -346,9 +347,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force
     return fs_nounits
 end
 
-function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors,
+function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors,
                          fs_mat=CUDA.zeros(T, D, length(sys)), step_n::Integer=0;
-                         n_threads::Integer=Threads.nthreads()) where {D, T}
+                         n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T}
     fill!(fs_mat, zero(T))
     val_ft = Val(T)
 
diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl
index 44e81da8..84ba7f08 100644
--- a/src/interactions/implicit_solvent.jl
+++ b/src/interactions/implicit_solvent.jl
@@ -411,10 +411,11 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}},
         factor_solvent = zero(T(coulomb_const_units))
     end
 
-    if isa(atoms, CuArray)
-        or = CuArray(offset_radii)
-        sor = CuArray(scaled_offset_radii)
-        is, js = CuArray(inds_i), CuArray(inds_j)
+    if isa(atoms, AbstractGPUArray)
+        ArrayType = get_array_type(atoms)
+        or = ArrayType(offset_radii)
+        sor = ArrayType(scaled_offset_radii)
+        is, js = ArrayType(inds_i), ArrayType(inds_j)
     else
         or = offset_radii
         sor = scaled_offset_radii
@@ -563,12 +564,13 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}},
         factor_solvent = zero(T(coulomb_const_units))
     end
 
-    if isa(atoms, CuArray)
-        or = CuArray(offset_radii)
-        sor = CuArray(scaled_offset_radii)
-        is, js = CuArray(inds_i), CuArray(inds_j)
-        d0s, m0s = CuArray(table_d0), CuArray(table_m0)
-        αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu)
+    if isa(atoms, AbstractGPUArray)
+        ArrayType = fine_array_type(atoms)
+        or = ArrayType(offset_radii)
+        sor = ArrayType(scaled_offset_radii)
+        is, js = ArrayType(inds_i), ArrayType(inds_j)
+        d0s, m0s = ArrayType(table_d0), ArrayType(table_m0)
+        αs, βs, γs = ArrayType(αs_cpu), ArrayType(βs_cpu), ArrayType(γs_cpu)
     else
         or = offset_radii
         sor = scaled_offset_radii
@@ -694,7 +696,7 @@ function born_radii_and_grad(inter::ImplicitSolventOBC{T}, coords, boundary) whe
     return Bs, B_grads, I_grads
 end
 
-function born_radii_and_grad(inter::ImplicitSolventOBC, coords::CuArray, boundary)
+function born_radii_and_grad(inter::ImplicitSolventOBC, coords::AbstractGPUArray, boundary)
     coords_i = @view coords[inter.is]
     coords_j = @view coords[inter.js]
     loop_res = born_radii_loop_OBC.(coords_i, coords_j, inter.oris, inter.srjs,
@@ -766,7 +768,7 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords, boundary) wh
     return Bs, B_grads, I_grads
 end
 
-function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, boundary) where T
+function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::AbstractGPUArray, boundary) where T
     Is, I_grads = gbsa_born_gpu(coords, inter.offset_radii, inter.scaled_offset_radii,
                                 inter.dist_cutoff, inter.offset, inter.neck_scale,
                                 inter.neck_cut, inter.d0s, inter.m0s, boundary, Val(T))
@@ -778,42 +780,41 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, bou
     return Bs, B_grads, I_grads
 end
 
-function cuda_threads_blocks_gbsa(n_inters)
+function gpu_threads_blocks_gbsa(n_inters)
     n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_IMPLICIT", "512"))
-    n_blocks = cld(n_inters, n_threads_gpu)
-    return n_threads_gpu, n_blocks
+    return n_threads_gpu
 end
 
 function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scaled_offset_radii,
                        dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary,
                        ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
     n_atoms = length(coords)
-    Is_nounits = CUDA.zeros(T, n_atoms)
-    I_grads_nounits = CUDA.zeros(T, n_atoms, n_atoms)
+    Is_nounits = KernelAbstractions.zeros(backend, T, n_atoms)
+    I_grads_nounits = KernelAbstractions.zeros(backend, T, n_atoms, n_atoms)
     n_inters = n_atoms ^ 2
-    n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters)
+    n_threads_gpu = gpu_threads_blocks_gbsa(n_inters)
 
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_born_kernel!(
-                Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii,
-                dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, Val(C))
+    kernel! = gbsa_born_kernel!(backend, n_threads_gpu)
+    kernel!(Is_nounits, I_grads_nounits, coords, offset_radii,
+            scaled_offset_radii, dist_cutoff, offset, neck_scale,
+            neck_cut, d0s, m0s, boundary, Val(C), ndrange = n_inters)
 
     Is = Is_nounits * unit(dist_cutoff)^-1
     I_grads = I_grads_nounits * unit(dist_cutoff)^-2
     return Is, I_grads
 end
 
-function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_offset_radii_var,
-                           dist_cutoff, offset, neck_scale, neck_cut, d0s_var, m0s_var, boundary,
-                           ::Val{C}) where C
-    coords              = CUDA.Const(coords_var)
-    offset_radii        = CUDA.Const(offset_radii_var)
-    scaled_offset_radii = CUDA.Const(scaled_offset_radii_var)
-    d0s                 = CUDA.Const(d0s_var)
-    m0s                 = CUDA.Const(m0s_var)
+@kernel function gbsa_born_kernel!(Is, I_grads, @Const(coords),
+                                   @Const(offset_radii),
+                                   @Const(scaled_offset_radii),
+                                   dist_cutoff, offset, neck_scale, neck_cut,
+                                   @Const(d0s), @Const(m0s), boundary,
+                                   ::Val{C}) where C
 
     n_atoms = length(coords)
     n_inters = n_atoms ^ 2
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    inter_i = @index(Global, Linear)
 
     @inbounds if inter_i <= n_inters
         i = cld(inter_i, n_atoms)
@@ -854,7 +855,6 @@ function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_off
             end
         end
     end
-    return nothing
 end
 
 function gb_force_loop_1(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, dist_cutoff,
@@ -965,16 +965,17 @@ end
 function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff,
                           factor_solute, factor_solvent, kappa, Bs, atom_charges::AbstractArray{T},
                           force_units) where {D, C, T}
+    backend = get_backend(coords)
     n_atoms = length(coords)
-    fs_mat = CUDA.zeros(T, D, n_atoms)
-    born_forces_mod_ustrip = CUDA.zeros(T, n_atoms)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms)
+    born_forces_mod_ustrip = KernelAbstractions.zeros(backend, T, n_atoms)
     n_inters = n_atoms_to_n_pairs(n_atoms) + n_atoms
-    n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters)
+    n_threads_gpu = gpu_threads_blocks_gbsa(n_inters)
 
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_1_kernel!(
-                fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff,
-                factor_solute, factor_solvent, kappa, Bs, atom_charges,
-                Val(D), Val(force_units))
+    kernel! = gbsa_force_1_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff,
+            factor_solute, factor_solvent, kappa, Bs, atom_charges,
+            Val(D), Val(force_units), ndrange = n_inters)
 
     return fs_mat, born_forces_mod_ustrip
 end
@@ -982,29 +983,30 @@ end
 function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, Bs, B_grads,
                           I_grads, born_forces, offset_radii, scaled_offset_radii,
                           force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
     n_atoms = length(coords)
-    fs_mat = CUDA.zeros(T, D, n_atoms)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms)
     n_inters = n_atoms ^ 2
-    n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters)
+    n_threads_gpu = gpu_threads_blocks_gbsa(n_inters)
 
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_2_kernel!(
-                fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii,
-                scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units))
+    kernel! = gbsa_force_2_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii,
+            scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units),
+            ndrange = n_inters)
 
     return fs_mat
 end
 
-function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, boundary, dist_cutoff,
-                              factor_solute, factor_solvent, kappa, Bs_var, atom_charges_var,
-                              ::Val{D}, ::Val{F}) where {D, F}
-    coords  = CUDA.Const(coords_var)
-    Bs      = CUDA.Const(Bs_var)
-    atom_charges = CUDA.Const(atom_charges_var)
+@kernel function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip,
+                                      @Const(coords), boundary, dist_cutoff,
+                                      factor_solute, factor_solvent, kappa,
+                                      @Const(Bs), @Const(atom_charges),
+                                      ::Val{D}, ::Val{F}) where {D, F}
 
     n_atoms = length(coords)
     n_inters_not_self = n_atoms_to_n_pairs(n_atoms)
     n_inters = n_inters_not_self + n_atoms
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    inter_i = @index(Global, Linear)
 
     @inbounds if inter_i <= n_inters
         if inter_i <= n_inters_not_self
@@ -1050,22 +1052,17 @@ function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, bounda
             end
         end
     end
-    return nothing
 end
 
-function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cutoff, or_var,
-                              sor_var, Bs_var, B_grads_var, I_grads_var, ::Val{D},
-                              ::Val{F}) where {D, F}
-    coords  = CUDA.Const(coords_var)
-    or      = CUDA.Const(or_var)
-    sor     = CUDA.Const(sor_var)
-    Bs      = CUDA.Const(Bs_var)
-    B_grads = CUDA.Const(B_grads_var)
-    I_grads = CUDA.Const(I_grads_var)
+@kernel function gbsa_force_2_kernel!(forces, born_forces, @Const(coords),
+                                      boundary, dist_cutoff, @Const(or),
+                                      @Const(sor), @Const(Bs),
+                                      @Const(B_grads), @Const(I_grads),
+                                      ::Val{D}, ::Val{F}) where {D, F}
 
     n_atoms = length(coords)
     n_inters = n_atoms ^ 2
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    inter_i = @index(Global, Linear)
 
     @inbounds if inter_i <= n_inters
         i = cld(inter_i, n_atoms)
@@ -1098,7 +1095,6 @@ function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cu
             end
         end
     end
-    return nothing
 end
 
 function AtomsCalculators.forces(sys, inter::AbstractGBSA; kwargs...)
diff --git a/src/kernels.jl b/src/kernels.jl
new file mode 100644
index 00000000..274c71bd
--- /dev/null
+++ b/src/kernels.jl
@@ -0,0 +1,345 @@
+# KernelAbstractions.jl kernels
+
+function get_array_type(a::AT) where AT <: AbstractArray
+    return AT.name.wrapper
+end
+
+@inline function sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j,
+                                    boundary, special, ::Val{F}) where F
+    dr = vector(coord_i, coord_j, boundary)
+    f_tuple = ntuple(length(inters)) do inter_type_i
+        force_gpu(inters[inter_type_i], dr, coord_i, coord_j, atom_i, atom_j, boundary, special)
+    end
+    f = sum(f_tuple)
+    if unit(f[1]) != F
+        # This triggers an error but it isn't printed
+        # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692
+        #   for how to throw a more meaningful error
+        error("wrong force unit returned, was expecting $F but got $(unit(f[1]))")
+    end
+    return f
+end
+
+function gpu_threads_blocks_pairwise(n_neighbors)
+    n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512"))
+    return n_threads_gpu
+end
+
+function gpu_threads_blocks_specific(n_inters)
+    n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128"))
+    return n_threads_gpu
+end
+
+function pairwise_force_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary,
+                            pairwise_inters, nbs, force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, length(atoms))
+    n_threads_gpu = gpu_threads_blocks_pairwise(length(nbs))
+    kernel! = pairwise_force_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, coords, atoms, boundary, pairwise_inters, nbs,
+            Val(D), Val(force_units), ndrange = length(nbs))
+    return fs_mat
+end
+
+@kernel function pairwise_force_kernel!(forces, @Const(coords),
+                                        @Const(atoms), boundary, inters,
+                                        @Const(neighbors), ::Val{D},
+                                        ::Val{F}) where {D, F}
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(neighbors)
+        i, j, special = neighbors[inter_i]
+        f = sum_pairwise_forces(inters, coords[i], coords[j], atoms[i], atoms[j], boundary, special, Val(F))
+        for dim in 1:D
+            fval = ustrip(f[dim])
+            Atomix.@atomic forces[dim, i] = forces[dim, i] - fval
+            Atomix.@atomic forces[dim, j] = forces[dim, j] + fval
+        end
+    end
+end
+
+function specific_force_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}},
+                            boundary, force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords))
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_force_1_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.inters,
+            Val(D), Val(force_units), ndrange = length(inter_list))
+    return fs_mat
+end
+
+function specific_force_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}},
+                            boundary, force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords))
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_force_2_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.inters, Val(D), Val(force_units),
+            ndrange = length(inter_list))
+    return fs_mat
+end
+
+function specific_force_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}},
+                            boundary, force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords))
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_force_3_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.ks, inter_list.inters, Val(D), Val(force_units),
+            ndrange = length(inter_list))
+    return fs_mat
+end
+
+function specific_force_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}},
+                            boundary, force_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    fs_mat = KernelAbstractions.zeros(backend, T, D, length(coords))
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_force_4_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(fs_mat, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.ks, inter_list.ls, inter_list.inters, Val(D),
+            Val(force_units), ndrange = length(inter_list))
+    return fs_mat
+end
+
+@kernel function specific_force_1_atoms_kernel!(forces, @Const(coords),
+                                                boundary, @Const(is),
+                                                @Const(inters), ::Val{D},
+                                                ::Val{F}) where {D, F}
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i = is[inter_i]
+        fs = force_gpu(inters[inter_i], coords[i], boundary)
+        if unit(fs.f1[1]) != F
+            error("wrong force unit returned, was expecting $F")
+        end
+        for dim in 1:D
+            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
+        end
+    end
+end
+
+@kernel function specific_force_2_atoms_kernel!(forces, @Const(coords),
+                                                boundary, @Const(is),
+                                                @Const(js),
+                                                @Const(inters), ::Val{D},
+                                                ::Val{F}) where {D, F}
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j = is[inter_i], js[inter_i]
+        fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary)
+        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F
+            error("wrong force unit returned, was expecting $F")
+        end
+        for dim in 1:D
+            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
+            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
+        end
+    end
+end
+
+@kernel function specific_force_3_atoms_kernel!(forces, @Const(coords),
+                                                boundary, @Const(is),
+                                                @Const(js), @Const(ks),
+                                                @Const(inters), ::Val{D},
+                                                ::Val{F}) where {D, F}
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j, k = is[inter_i], js[inter_i], ks[inter_i]
+        fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary)
+        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F
+            error("wrong force unit returned, was expecting $F")
+        end
+        for dim in 1:D
+            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
+            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
+            Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim])
+        end
+    end
+end
+
+@kernel function specific_force_4_atoms_kernel!(forces, @Const(coords),
+                                                boundary, @Const(is),
+                                                @Const(js), @Const(ks),
+                                                @Const(ls),
+                                                @Const(inters), ::Val{D},
+                                                ::Val{F}) where {D, F}
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i]
+        fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary)
+        if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F
+            error("wrong force unit returned, was expecting $F")
+        end
+        for dim in 1:D
+            Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim])
+            Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim])
+            Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim])
+            Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim])
+        end
+    end
+end
+
+function pairwise_pe_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary,
+                         pairwise_inters, nbs, energy_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    pe_vec = KernelAbstractions.zeros(backend, T, 1)
+    n_threads_gpu = gpu_threads_blocks_pairwise(length(nbs))
+    kernel! = pairwise_pe_kernel!(backend, n_threads_gpu)
+    kernel!(pe_vec, coords, atoms, boundary, pairwise_inters, nbs,
+            Val(energy_units), ndrange = length(nbs))
+    return pe_vec
+end
+
+@kernel function pairwise_pe_kernel!(energy, @Const(coords),
+                                     @Const(atoms), boundary, inters,
+                                     @Const(neighbors), ::Val{E}) where E
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(neighbors)
+        i, j, special = neighbors[inter_i]
+        coord_i, coord_j = coords[i], coords[j]
+        dr = vector(coord_i, coord_j, boundary)
+        pe = potential_energy_gpu(inters[1], dr, coord_i, coord_j, atoms[i], atoms[j],
+                                  boundary, special)
+        for inter in inters[2:end]
+            pe += potential_energy_gpu(inter, dr, coord_i, coord_j, atoms[i], atoms[j],
+                                       boundary, special)
+        end
+        if unit(pe) != E
+            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
+        end
+        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
+    end
+end
+
+function specific_pe_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}},
+                         boundary, energy_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    pe_vec = KernelAbstractions.zeros(backend, T, 1)
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_pe_1_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.inters,
+            Val(energy_units), ndrange = length(inter_list))
+    return pe_vec
+end
+
+function specific_pe_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}},
+                         boundary, energy_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    pe_vec = KernelAbstractions.zeros(backend, T, 1)
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_pe_2_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.inters, Val(energy_units), ndrange = length(inter_list))
+    return pe_vec
+end
+
+function specific_pe_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}},
+                         boundary, energy_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    pe_vec = KernelAbstractions.zeros(backend, T, 1)
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_pe_3_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.ks, inter_list.inters, Val(energy_units),
+            ndrange = length(inter_list))
+    return pe_vec
+end
+
+function specific_pe_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}},
+                         boundary, energy_units, ::Val{T}) where {D, C, T}
+    backend = get_backend(coords)
+    pe_vec = KernelAbstractions.zeros(backend, T, 1)
+    n_threads_gpu = gpu_threads_blocks_specific(length(inter_list))
+    kernel! = specific_pe_4_atoms_kernel!(backend, n_threads_gpu)
+    kernel!(pe_vec, coords, boundary, inter_list.is, inter_list.js,
+            inter_list.ks, inter_list.ls, inter_list.inters,
+            Val(energy_units), ndrange = length(inter_list))
+    return pe_vec
+end
+
+@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords),
+                                             boundary, @Const(is),
+                                             @Const(inters),
+                                             ::Val{E}) where E
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i = is[inter_i]
+        pe = potential_energy_gpu(inters[inter_i], coords[i], boundary)
+        if unit(pe) != E
+            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
+        end
+        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
+    end
+end
+
+@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords),
+                                             boundary, @Const(is),
+                                             @Const(js),
+                                             @Const(inters),
+                                             ::Val{E}) where E
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j = is[inter_i], js[inter_i]
+        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary)
+        if unit(pe) != E
+            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
+        end
+        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
+    end
+end
+
+@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords),
+                                             boundary, @Const(is),
+                                             @Const(js), @Const(ks),
+                                             @Const(inters),
+                                             ::Val{E}) where E
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j, k = is[inter_i], js[inter_i], ks[inter_i]
+        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary)
+        if unit(pe) != E
+            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
+        end
+        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
+    end
+end
+
+@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords),
+                                             boundary, @Const(is),
+                                             @Const(js), @Const(ks),
+                                             @Const(ls),
+                                             @Const(inters),
+                                             ::Val{E}) where E
+
+    inter_i = @index(Global, Linear)
+
+    @inbounds if inter_i <= length(is)
+        i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i]
+        pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary)
+        if unit(pe) != E
+            error("wrong energy unit returned, was expecting $E but got $(unit(pe))")
+        end
+        Atomix.@atomic :monotonic energy[1] += ustrip(pe)
+    end
+end
diff --git a/src/neighbors.jl b/src/neighbors.jl
index 5289496c..e238fff8 100644
--- a/src/neighbors.jl
+++ b/src/neighbors.jl
@@ -64,12 +64,12 @@ function DistanceNeighborFinder(;
                 eligible, dist_cutoff, special, n_steps, zero(eligible))
 end
 
-function find_neighbors(sys::System{D, false},
+function find_neighbors(sys::System{D, AT},
                         nf::DistanceNeighborFinder,
                         current_neighbors=nothing,
                         step_n::Integer=0,
                         force_recompute::Bool=false;
-                        n_threads::Integer=Threads.nthreads()) where D
+                        n_threads::Integer=Threads.nthreads()) where {D, AT}
     if !force_recompute && !iszero(step_n % nf.n_steps)
         return current_neighbors
     end
@@ -92,20 +92,19 @@ function find_neighbors(sys::System{D, false},
     return NeighborList(length(neighbors_list), neighbors_list)
 end
 
-function cuda_threads_blocks_dnf(n_inters)
+function gpu_threads_blocks_dnf(n_inters)
     n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_DISTANCENF", "512"))
-    n_blocks = cld(n_inters, n_threads_gpu)
-    return n_threads_gpu, n_blocks
+    return n_threads_gpu
 end
 
-function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var,
-                                          boundary, sq_dist_neighbors)
-    coords    = CUDA.Const(coords_var)
-    eligible = CUDA.Const(eligible_var)
+@kernel function distance_neighbor_finder_kernel!(neighbors,
+                                                  @Const(coords),
+                                                  @Const(eligible),
+                                                  boundary, sq_dist_neighbors)
 
     n_atoms = length(coords)
     n_inters = n_atoms_to_n_pairs(n_atoms)
-    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    inter_i = @index(Global, Linear)
 
     @inbounds if inter_i <= n_inters
         i, j = pair_index(n_atoms, inter_i)
@@ -117,28 +116,28 @@ function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var,
             end
         end
     end
-    return nothing
 end
 
 lists_to_tuple_list(i, j, w) = (Int32(i), Int32(j), w)
 
-function find_neighbors(sys::System{D, true},
+function find_neighbors(sys::System{D, AT},
                         nf::DistanceNeighborFinder,
                         current_neighbors=nothing,
                         step_n::Integer=0,
                         force_recompute::Bool=false;
-                        kwargs...) where D
+                        kwargs...) where {D, AT <: AbstractGPUArray}
     if !force_recompute && !iszero(step_n % nf.n_steps)
         return current_neighbors
     end
 
     nf.neighbors .= false
     n_inters = n_atoms_to_n_pairs(length(sys))
-    n_threads_gpu, n_blocks = cuda_threads_blocks_dnf(n_inters)
+    n_threads_gpu = gpu_threads_blocks_dnf(n_inters)
 
-    CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks distance_neighbor_finder_kernel!(
-        nf.neighbors, sys.coords, nf.eligible, sys.boundary, nf.dist_cutoff^2,
-    )
+    backend = get_backend(sys.coords)
+    kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu)
+    kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary,
+            nf.dist_cutoff^2, ndrange = n_inters)
 
     pairs = findall(nf.neighbors)
     nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2)
@@ -307,19 +306,19 @@ function reduce_pairs(neighbors::NeighborList, neighbors_threaded::Vector{Neighb
     return neighbors
 end
 
-function find_neighbors(sys::System{D, G},
+function find_neighbors(sys::System{D, AT},
                         nf::CellListMapNeighborFinder,
                         current_neighbors=nothing,
                         step_n::Integer=0,
                         force_recompute::Bool=false;
-                        n_threads::Integer=Threads.nthreads()) where {D, G}
+                        n_threads::Integer=Threads.nthreads()) where {D, AT}
     if !force_recompute && !iszero(step_n % nf.n_steps)
         return current_neighbors
     end
 
     if isnothing(current_neighbors)
         neighbors = NeighborList()
-    elseif G
+    elseif AT <: AbstractGPUArray
         neighbors = NeighborList(current_neighbors.n, Array(current_neighbors.list))
     else
         neighbors = current_neighbors
@@ -351,8 +350,8 @@ function find_neighbors(sys::System{D, G},
     )
 
     nf.cl = cl
-    if G
-        return NeighborList(neighbors.n, CuArray(neighbors.list))
+    if AT <: AbstractGPUArray
+        return NeighborList(neighbors.n, AT(neighbors.list))
     else
         return neighbors
     end
diff --git a/src/setup.jl b/src/setup.jl
index 978ad31a..7a519b5b 100644
--- a/src/setup.jl
+++ b/src/setup.jl
@@ -428,8 +428,8 @@ are not available when reading Gromacs files.
 - `loggers=()`: the loggers that record properties of interest during a
     simulation.
 - `units::Bool=true`: whether to use Unitful quantities.
-- `gpu::Bool=false`: whether to move the relevant parts of the system onto
-    the GPU.
+- `ArrayType::AbstractArray = Array`: The ArrayType desired for the simulation
+   (for GPU support, use CuArray or ROCArray)
 - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions.
 - `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be
     greater than `dist_cutoff`.
@@ -452,7 +452,7 @@ function System(coord_file::AbstractString,
                 velocities=nothing,
                 loggers=(),
                 units::Bool=true,
-                gpu::Bool=false,
+                ArrayType::Type{AT} where AT <: AbstractArray = Array,
                 dist_cutoff=units ? 1.0u"nm" : 1.0,
                 dist_neighbors=units ? 1.2u"nm" : 1.2,
                 center_coords::Bool=true,
@@ -824,9 +824,9 @@ function System(coord_file::AbstractString,
     specific_inter_array = []
     if length(bonds.is) > 0
         push!(specific_inter_array, InteractionList2Atoms(
-            gpu ? CuArray(bonds.is) : bonds.is,
-            gpu ? CuArray(bonds.js) : bonds.js,
-            gpu ? CuArray([bonds.inters...]) : [bonds.inters...],
+            ArrayType(bonds.is),
+            ArrayType(bonds.js),
+            ArrayType([bonds.inters...]),
             bonds.types,
         ))
         topology = MolecularTopology(bonds.is, bonds.js, n_atoms)
@@ -835,30 +835,30 @@ function System(coord_file::AbstractString,
     end
     if length(angles.is) > 0
         push!(specific_inter_array, InteractionList3Atoms(
-            gpu ? CuArray(angles.is) : angles.is,
-            gpu ? CuArray(angles.js) : angles.js,
-            gpu ? CuArray(angles.ks) : angles.ks,
-            gpu ? CuArray([angles.inters...]) : [angles.inters...],
+            ArrayType(angles.is),
+            ArrayType(angles.js),
+            ArrayType(angles.ks),
+            ArrayType([angles.inters...]),
             angles.types,
         ))
     end
     if length(torsions.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
-            gpu ? CuArray(torsions.is) : torsions.is,
-            gpu ? CuArray(torsions.js) : torsions.js,
-            gpu ? CuArray(torsions.ks) : torsions.ks,
-            gpu ? CuArray(torsions.ls) : torsions.ls,
-            gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad,
+            ArrayType(torsions.is),
+            ArrayType(torsions.js),
+            ArrayType(torsions.ks),
+            ArrayType(torsions.ls),
+            ArrayType(torsion_inters_pad),
             torsions.types,
         ))
     end
     if length(impropers.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
-            gpu ? CuArray(impropers.is) : impropers.is,
-            gpu ? CuArray(impropers.js) : impropers.js,
-            gpu ? CuArray(impropers.ks) : impropers.ks,
-            gpu ? CuArray(impropers.ls) : impropers.ls,
-            gpu ? CuArray(improper_inters_pad) : improper_inters_pad,
+            ArrayType(impropers.is),
+            ArrayType(impropers.js),
+            ArrayType(impropers.ks),
+            ArrayType(impropers.ls),
+            ArrayType(improper_inters_pad),
             impropers.types,
         ))
     end
@@ -887,10 +887,10 @@ function System(coord_file::AbstractString,
     end
     coords = wrap_coords.(coords, (boundary_used,))
 
-    if gpu || !use_cell_list
+    if (ArrayType <: AbstractGPUArray) || !use_cell_list
         neighbor_finder = DistanceNeighborFinder(
-            eligible=(gpu ? CuArray(eligible) : eligible),
-            special=(gpu ? CuArray(special) : special),
+            eligible=(ArrayType(eligible)),
+            special=(ArrayType(special)),
             n_steps=10,
             dist_cutoff=T(dist_neighbors),
         )
@@ -904,13 +904,9 @@ function System(coord_file::AbstractString,
             dist_cutoff=T(dist_neighbors),
         )
     end
-    if gpu
-        atoms = CuArray([atoms_abst...])
-        coords_dev = CuArray(coords)
-    else
-        atoms = [atoms_abst...]
-        coords_dev = coords
-    end
+
+    atoms = ArrayType(atoms)
+    coords = ArrayType(coords)
 
     if isnothing(velocities)
         if units
@@ -965,7 +961,7 @@ function System(T::Type,
                 velocities=nothing,
                 loggers=(),
                 units::Bool=true,
-                gpu::Bool=false,
+                ArrayType::Type{AT} where AT <: AbstractArray = Array,
                 dist_cutoff=units ? 1.0u"nm" : 1.0,
                 dist_neighbors=units ? 1.2u"nm" : 1.2,
                 center_coords::Bool=true,
@@ -1246,9 +1242,9 @@ function System(T::Type,
     specific_inter_array = []
     if length(bonds.is) > 0
         push!(specific_inter_array, InteractionList2Atoms(
-            gpu ? CuArray(bonds.is) : bonds.is,
-            gpu ? CuArray(bonds.js) : bonds.js,
-            gpu ? CuArray([bonds.inters...]) : [bonds.inters...],
+            ArrayType(bonds.is),
+            ArrayType(bonds.js),
+            ArrayType([bonds.inters...]),
             bonds.types,
         ))
         topology = MolecularTopology(bonds.is, bonds.js, n_atoms)
@@ -1257,29 +1253,29 @@ function System(T::Type,
     end
     if length(angles.is) > 0
         push!(specific_inter_array, InteractionList3Atoms(
-            gpu ? CuArray(angles.is) : angles.is,
-            gpu ? CuArray(angles.js) : angles.js,
-            gpu ? CuArray(angles.ks) : angles.ks,
-            gpu ? CuArray([angles.inters...]) : [angles.inters...],
+            ArrayType(angles.is),
+            ArrayType(angles.js),
+            ArrayType(angles.ks),
+            ArrayType([angles.inters...]),
             angles.types,
         ))
     end
     if length(torsions.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
-            gpu ? CuArray(torsions.is) : torsions.is,
-            gpu ? CuArray(torsions.js) : torsions.js,
-            gpu ? CuArray(torsions.ks) : torsions.ks,
-            gpu ? CuArray(torsions.ls) : torsions.ls,
-            gpu ? CuArray([torsions.inters...]) : [torsions.inters...],
+            ArrayType(torsions.is),
+            ArrayType(torsions.js),
+            ArrayType(torsions.ks),
+            ArrayType(torsions.ls),
+            ArrayType([torsions.inters...]),
             torsions.types,
         ))
     end
     specific_inter_lists = tuple(specific_inter_array...)
 
-    if gpu || !use_cell_list
+    if ArrayType <: AbstractGPUArray || !use_cell_list
         neighbor_finder = DistanceNeighborFinder(
-            eligible=(gpu ? CuArray(eligible) : eligible),
-            special=(gpu ? CuArray(special) : special),
+            eligible=(ArrayType(eligible)),
+            special=(ArrayType(special)),
             n_steps=10,
             dist_cutoff=T(dist_neighbors),
         )
@@ -1293,13 +1289,9 @@ function System(T::Type,
             dist_cutoff=T(dist_neighbors),
         )
     end
-    if gpu
-        atoms = CuArray([atoms_abst...])
-        coords_dev = CuArray(coords)
-    else
-        atoms = [atoms_abst...]
-        coords_dev = coords
-    end
+
+    atoms = ArrayType(atoms)
+    coords = ArrayType(coords)
 
     if isnothing(velocities)
         if units
diff --git a/src/simulators.jl b/src/simulators.jl
index eb1e9e8b..8eea8973 100644
--- a/src/simulators.jl
+++ b/src/simulators.jl
@@ -818,12 +818,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during
 Successful exchanges should exchange coordinates and velocities as appropriate.
 Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful.
 """
-function remd_exchange!(sys::ReplicaSystem{D, G, T},
+function remd_exchange!(sys::ReplicaSystem{D, AT, T},
                         sim::TemperatureREMD,
                         n::Integer,
                         m::Integer;
                         n_threads::Integer=Threads.nthreads(),
-                        rng=Random.default_rng()) where {D, G, T}
+                        rng=Random.default_rng()) where {D, AT, T}
     T_n, T_m = sim.temperatures[n], sim.temperatures[m]
     β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m)
     neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder;
@@ -909,12 +909,12 @@ function simulate!(sys::ReplicaSystem,
     return simulate_remd!(sys, sim, n_steps; rng=rng, n_threads=n_threads, run_loggers=run_loggers)
 end
 
-function remd_exchange!(sys::ReplicaSystem{D, G, T},
+function remd_exchange!(sys::ReplicaSystem{D, AT, T},
                         sim::HamiltonianREMD,
                         n::Integer,
                         m::Integer;
                         n_threads::Integer=Threads.nthreads(),
-                        rng=Random.default_rng()) where {D, G, T}
+                        rng=Random.default_rng()) where {D, AT, T}
     T_sim = sim.temperature
     β_sim = inv(sys.k * T_sim)
     neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder;
@@ -1034,11 +1034,11 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict())
     return MetropolisMonteCarlo(temperature, trial_moves, trial_args)
 end
 
-@inline function simulate!(sys::System{D, G, T},
+@inline function simulate!(sys::System{D, AT, T},
                            sim::MetropolisMonteCarlo,
                            n_steps::Integer;
                            n_threads::Integer=Threads.nthreads(),
-                           run_loggers=true) where {D, G, T}
+                           run_loggers=true) where {D, AT, T}
     neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads)
     E_old = potential_energy(sys, neighbors; n_threads=n_threads)
     coords_old = similar(sys.coords)
@@ -1075,8 +1075,8 @@ Performs a random translation of the coordinates of a randomly selected atom in
 The translation is generated using a uniformly selected direction and uniformly selected length
 in range [0, 1) scaled by `shift_size` which should have appropriate length units.
 """
-function random_uniform_translation!(sys::System{D, G, T};
-                                     shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, G, T}
+function random_uniform_translation!(sys::System{D, AT, T};
+                                     shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, AT, T}
     rand_idx = rand(eachindex(sys))
     direction = random_unit_vector(T, D)
     magnitude = rand(T) * shift_size
@@ -1093,8 +1093,8 @@ The translation is generated using a uniformly chosen direction and length selec
 the standard normal distribution i.e. with mean 0 and standard deviation 1, scaled by `shift_size`
 which should have appropriate length units.
 """
-function random_normal_translation!(sys::System{D, G, T};
-                                    shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, G, T}
+function random_normal_translation!(sys::System{D, AT, T};
+                                    shift_size=oneunit(eltype(eltype(sys.coords)))) where {D, AT, T}
     rand_idx = rand(eachindex(sys))
     direction = random_unit_vector(T, D)
     magnitude = randn(T) * shift_size
diff --git a/src/spatial.jl b/src/spatial.jl
index de0941b1..07502a9f 100644
--- a/src/spatial.jl
+++ b/src/spatial.jl
@@ -614,11 +614,11 @@ function random_velocities(sys::AtomsBase.AbstractSystem{2}, temp; rng=Random.de
 end
 
 function random_velocities(sys::System{3, true}, temp; rng=Random.default_rng())
-    return CuArray(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng))
+    return AT(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng))
 end
 
 function random_velocities(sys::System{2, true}, temp; rng=Random.default_rng())
-    return CuArray(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng))
+    return AT(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng))
 end
 
 """
@@ -738,9 +738,9 @@ function virial(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt
     return v
 end
 
-function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_nonl,
-                            pairwise_inters_nl) where {D, G, T}
-    if G
+function virial(sys::System{D, AT, T}, neighbors_dev, step_n, pairwise_inters_nonl,
+                            pairwise_inters_nl) where {D, AT, T}
+    if AT <: AbstractGPUArray
         coords, velocities, atoms = Array(sys.coords), Array(sys.velocities), Array(sys.atoms)
         if isnothing(neighbors_dev)
             neighbors = neighbors_dev
@@ -792,7 +792,7 @@ function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_non
 end
 
 # Default for general interactions
-function virial(inter, sys::System{D, G, T}, args...; kwargs...) where {D, G, T}
+function virial(inter, sys::System{D, AT, T}, args...; kwargs...) where {D, AT, T}
     return zero(T) * sys.energy_units
 end
 
@@ -874,8 +874,9 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo
     end
 end
 
-function molecule_centers(coords::CuArray, boundary, topology)
-    return CuArray(molecule_centers(Array(coords), boundary, topology))
+function molecule_centers(coords::AbstractGPUArray, boundary, topology)
+    ArrayType = get_array_type(coords)
+    return ArrayType(molecule_centers(Array(coords), boundary, topology))
 end
 
 # Allows scaling multiple vectors at once by broadcasting this function
diff --git a/src/types.jl b/src/types.jl
index 225d9cff..977fbe7b 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -431,8 +431,6 @@ Base.firstindex(::NoNeighborList) = 1
 Base.lastindex(nl::NoNeighborList) = length(nl)
 Base.eachindex(nl::NoNeighborList) = Base.OneTo(length(nl))
 
-CUDA.Const(nl::NoNeighborList) = nl
-
 """
     System(; <keyword arguments>)
 
@@ -481,8 +479,8 @@ interface described there.
     modified in some simulations. `k` is chosen based on the `energy_units` given.
 - `data::DA=nothing`: arbitrary data associated with the system.
 """
-mutable struct System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF,
-                      L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D}
+mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF,
+                      L, F, E, K, M, DA} <: AbstractSystem{D}
     atoms::A
     coords::C
     boundary::B
@@ -521,7 +519,7 @@ function System(;
                 k=default_k(energy_units),
                 data=nothing)
     D = AtomsBase.n_dimensions(boundary)
-    G = isa(coords, CuArray)
+    AT = get_array_type(coords)
     T = float_type(boundary)
     A = typeof(atoms)
     C = typeof(coords)
@@ -567,19 +565,19 @@ function System(;
         end
     end
 
-    if isa(atoms, CuArray) && !isa(coords, CuArray)
+    if isa(atoms, AbstractGPUArray) && !isa(coords, AbstractGPUArray)
         throw(ArgumentError("the atoms are on the GPU but the coordinates are not"))
     end
-    if isa(coords, CuArray) && !isa(atoms, CuArray)
+    if isa(coords, AbstractGPUArray) && !isa(atoms, AbstractGPUArray)
         throw(ArgumentError("the coordinates are on the GPU but the atoms are not"))
     end
-    if isa(atoms, CuArray) && !isa(vels, CuArray)
+    if isa(atoms, AbstractGPUArray) && !isa(vels, AbstractGPUArray)
         throw(ArgumentError("the atoms are on the GPU but the velocities are not"))
     end
-    if isa(vels, CuArray) && !isa(atoms, CuArray)
+    if isa(vels, AbstractGPUArray) && !isa(atoms, AbstractGPUArray)
         throw(ArgumentError("the velocities are on the GPU but the atoms are not"))
     end
-    if isa(atoms, CuArray) && length(constraints) > 0
+    if isa(atoms, AbstractGPUArray) && length(constraints) > 0
         @warn "Constraints are not currently compatible with simulation on the GPU"
     end
 
@@ -596,7 +594,7 @@ function System(;
     check_units(atoms, coords, vels, energy_units, force_units, pairwise_inters,
                 specific_inter_lists, general_inters, boundary)
 
-    return System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}(
+    return System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}(
                     atoms, coords, boundary, vels, atoms_data, topology, pairwise_inters,
                     specific_inter_lists, general_inters, constraints, neighbor_finder, loggers,
                     df, force_units, energy_units, k_converted, atom_masses, data)
@@ -847,7 +845,7 @@ construction where `n` is the number of threads to be used per replica.
     modified in some simulations. `k` is chosen based on the `energy_units` given.
 - `data::DA=nothing`: arbitrary data associated with the replica system.
 """
-mutable struct ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D}
+mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AbstractSystem{D}
     atoms::A
     n_replicas::Int
     atoms_data::AD
@@ -884,7 +882,8 @@ function ReplicaSystem(;
                         k=default_k(energy_units),
                         data=nothing)
     D = AtomsBase.n_dimensions(boundary)
-    G = isa(replica_coords[1], CuArray)
+    D = n_dimensions(boundary)
+    AT = get_array_type(replica_coords[1])
     T = float_type(boundary)
     A = typeof(atoms)
     AD = typeof(atoms_data)
@@ -995,25 +994,25 @@ function ReplicaSystem(;
         throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries"))
     end
 
-    n_cuarray = sum(y -> isa(y, CuArray), replica_coords)
+    n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_coords)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
         throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU"))
     end
-    if isa(atoms, CuArray) && n_cuarray != n_replicas
+    if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas
         throw(ArgumentError("the atoms are on the GPU but the coordinates are not"))
     end
-    if n_cuarray == n_replicas && !isa(atoms, CuArray)
+    if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray)
         throw(ArgumentError("the coordinates are on the GPU but the atoms are not"))
     end
 
-    n_cuarray = sum(y -> isa(y, CuArray), replica_velocities)
+    n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_velocities)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
         throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU"))
     end
-    if isa(atoms, CuArray) && n_cuarray != n_replicas
+    if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas
         throw(ArgumentError("the atoms are on the GPU but the velocities are not"))
     end
-    if n_cuarray == n_replicas && !isa(atoms, CuArray)
+    if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray)
         throw(ArgumentError("the velocities are on the GPU but the atoms are not"))
     end
 
@@ -1023,7 +1022,7 @@ function ReplicaSystem(;
     k_converted = convert_k_units(T, k, energy_units)
     K = typeof(k_converted)
 
-    replicas = Tuple(System{D, G, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]),
+    replicas = Tuple(System{D, AT, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]),
                         typeof(replica_specific_inter_lists[i]), typeof(replica_general_inters[i]),
                         typeof(replica_constraints[i]), NF, typeof(replica_loggers[i]), F, E, K,
                         M, Nothing}(
@@ -1034,7 +1033,7 @@ function ReplicaSystem(;
             force_units, energy_units, k_converted, atom_masses, nothing) for i in 1:n_replicas)
     R = typeof(replicas)
 
-    return ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA}(
+    return ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA}(
             atoms, n_replicas, atoms_data, exchange_logger, force_units,
             energy_units, k_converted, replicas, data)
 end
@@ -1044,7 +1043,7 @@ end
 
 Whether a [`System`](@ref) or [`ReplicaSystem`](@ref) is on the GPU.
 """
-is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G
+is_on_gpu(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT <: AbstractGPUArray
 
 """
     float_type(sys)
@@ -1052,7 +1051,7 @@ is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G
 
 The float type a [`System`](@ref), [`ReplicaSystem`](@ref) or bounding box uses.
 """
-float_type(::Union{System{D, G, T}, ReplicaSystem{D, G, T}}) where {D, G, T} = T
+float_type(::Union{System{D, AT, T}, ReplicaSystem{D, AT, T}}) where {D, AT, T} = T
 
 """
     masses(sys)
@@ -1071,8 +1070,7 @@ charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms)
 charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i])
 
 # Move an array to the GPU depending on whether the system is on the GPU
-move_array(arr, ::System{D, false}) where {D} = arr
-move_array(arr, ::System{D, true }) where {D} = CuArray(arr)
+move_array(arr, ::System{D, AT}) where {D, AT} = AT(arr)
 
 Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i]
 Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms)
diff --git a/test/Project.toml b/test/Project.toml
index 3901cc98..e7e9a459 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 AtomsBaseTesting = "ed7c10db-df7e-4efa-a7be-4f4190f7f227"
diff --git a/test/basic.jl b/test/basic.jl
index e549fae7..686de36b 100644
--- a/test/basic.jl
+++ b/test/basic.jl
@@ -176,16 +176,16 @@
     @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)]
 
     ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...)
-    for gpu in gpu_list
-        sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=gpu, use_cell_list=false)
+    for ArrayType in array_list
+        sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; ArrayType=ArrayType, use_cell_list=false)
         mcs = molecule_centers(sys.coords, sys.boundary, sys.topology)
-        @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.04u"nm")
+        @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm")
 
         # Mark all pairs as ineligible for pairwise interactions and check that the
         #   potential energy from the specific interactions does not change on scaling
         no_nbs = falses(length(sys), length(sys))
         sys.neighbor_finder = DistanceNeighborFinder(
-            eligible=(gpu ? CuArray(no_nbs) : no_nbs),
+            eligible=(ArrayType(no_nbs)),
             dist_cutoff=1.0u"nm",
         )
         coords_start = copy(sys.coords)
@@ -310,8 +310,9 @@ end
         end
     end
 
-    if run_gpu_tests
-        sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=true)
+    if run_cuda_tests
+        sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff;
+                         ArrayType=CuArray)
         for neighbor_finder in (DistanceNeighborFinder,)
             nf_gpu = neighbor_finder(
                 eligible=sys_gpu.neighbor_finder.eligible,
@@ -320,7 +321,25 @@ end
             )
             neighbors_gpu = find_neighbors(sys_gpu, nf_gpu)
             @test length(neighbors_gpu) == n_neighbors_ref
-            CUDA.allowscalar() do
+            GPUArrays.allowscalar() do
+                @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool}
+            end
+            @test identical_neighbors(neighbors_gpu, neighbors_ref)
+        end
+    end
+
+    if run_rocm_tests
+        sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff;
+                         ArrayType=ROCArray)
+        for neighbor_finder in (DistanceNeighborFinder,)
+            nf_gpu = neighbor_finder(
+                eligible=sys_gpu.neighbor_finder.eligible,
+                special=sys_gpu.neighbor_finder.special,
+                dist_cutoff=dist_cutoff,
+            )
+            neighbors_gpu = find_neighbors(sys_gpu, nf_gpu)
+            @test length(neighbors_gpu) == n_neighbors_ref
+            GPUArrays.allowscalar() do
                 @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool}
             end
             @test identical_neighbors(neighbors_gpu, neighbors_ref)
@@ -336,9 +355,13 @@ end
     coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm"
     coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm"
     @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å"
-    if run_gpu_tests
+    if run_cuda_tests
         @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å"
     end
+    if run_rocm_tests
+        @test rmsd(ROCArray(coords_1),
+                   ROCArray(coords_2)) ≈ 2.54859467758795u"Å"
+    end
 
     bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector)
     coords = SVector{3, Float64}.(eachcol(BioStructures.coordarray(bb_atoms))) / 10 * u"nm"
diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl
index d64a4cf2..8ae21aad 100644
--- a/test/energy_conservation.jl
+++ b/test/energy_conservation.jl
@@ -6,7 +6,7 @@ using CUDA
 using Test
 
 @testset "Lennard-Jones energy conservation" begin
-    function test_energy_conservation(gpu::Bool, n_threads::Integer, n_steps::Integer)
+    function test_energy_conservation(ArrayType::AbstractArray, n_threads::Integer, n_steps::Integer)
         n_atoms = 2_000
         atom_mass = 40.0u"g/mol"
         temp = 1.0u"K"
@@ -26,8 +26,8 @@ using Test
             coords = place_atoms(n_atoms, boundary; min_dist=0.6u"nm")
 
             sys = System(
-                atoms=(gpu ? CuArray(atoms) : atoms),
-                coords=(gpu ? CuArray(coords) : coords),
+                atoms=(ArrayType(atoms) : atoms),
+                coords=(ArrayType(coords) : coords),
                 boundary=boundary,
                 pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=false),),
                 loggers=(
@@ -56,11 +56,11 @@ using Test
         end
     end
 
-    test_energy_conservation(false, 1, 10_000)
+    test_energy_conservation(Array, 1, 10_000)
     if Threads.nthreads() > 1
-        test_energy_conservation(false, Threads.nthreads(), 50_000)
+        test_energy_conservation(Array, Threads.nthreads(), 50_000)
     end
     if CUDA.functional()
-        test_energy_conservation(true, 1, 100_000)
+        test_energy_conservation(CuArray, 1, 100_000)
     end
 end
diff --git a/test/minimization.jl b/test/minimization.jl
index 83a10f0e..253c5e8b 100644
--- a/test/minimization.jl
+++ b/test/minimization.jl
@@ -42,14 +42,14 @@
     @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1";
                     atol=1e-4u"kJ * mol^-1")
 
-    if run_gpu_tests
-        coords = CuArray([
+    for ArrayType in array_list[2:end]
+        coords = ArrayType([
             SVector(1.0, 1.0, 1.0)u"nm",
             SVector(1.6, 1.0, 1.0)u"nm",
             SVector(1.4, 1.6, 1.0)u"nm",
         ])
         sys = System(
-            atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]),
+            atoms=ArrayType([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]),
             coords=coords,
             boundary=CubicBoundary(5.0u"nm"),
             pairwise_inters=(LennardJones(),),
@@ -57,10 +57,12 @@
         sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1")
 
         simulate!(sys, sim)
-        dists = distances(sys.coords, sys.boundary)
+        dists = Array(distances(sys.coords, sys.boundary))
         dists_flat = dists[triu(trues(3, 3), 1)]
-        @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat)
+
+        # GPU tolerances are more lenient (possibly for f32 shenanigans)
+        @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat)
         @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1";
-                        atol=1e-4u"kJ * mol^-1")
+                        atol=1e-2u"kJ * mol^-1")
     end
 end
diff --git a/test/protein.jl b/test/protein.jl
index 7153eb63..9d145d10 100644
--- a/test/protein.jl
+++ b/test/protein.jl
@@ -179,12 +179,12 @@ end
     @test pis_grad == sys_nounits.pairwise_inters
 
     # Test the same simulation on the GPU
-    if run_gpu_tests
+    for ArrayType in array_list[2:end]
         sys = System(
             joinpath(data_dir, "6mrr_equil.pdb"),
             ff;
-            velocities=CuArray(copy(velocities_start)),
-            gpu=true,
+            velocities=ArrayType(deepcopy(velocities_start)),
+            ArrayType = ArrayType,
             center_coords=false,
         )
         @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1"
@@ -211,9 +211,9 @@ end
         sys_nounits = System(
             joinpath(data_dir, "6mrr_equil.pdb"),
             ff_nounits;
-            velocities=CuArray(copy(ustrip_vec.(velocities_start))),
+            velocities=ArrayType(deepcopy(ustrip_vec.(velocities_start))),
             units=false,
-            gpu=true,
+            ArrayType = ArrayType,
             center_coords=false,
         )
         @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1"
@@ -248,13 +248,13 @@ end
 @testset "Implicit solvent" begin
     ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...)
 
-    for gpu in gpu_list
+    for ArrayType in array_list
         for solvent_model in ("obc2", "gbn2")
             sys = System(
                 joinpath(data_dir, "6mrr_nowater.pdb"),
                 ff;
                 boundary=CubicBoundary(100.0u"nm"),
-                gpu=gpu,
+                ArrayType = ArrayType,
                 dist_cutoff=5.0u"nm",
                 dist_neighbors=5.0u"nm",
                 implicit_solvent=solvent_model,
diff --git a/test/runtests.jl b/test/runtests.jl
index 00fc6654..3be6418e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,6 +7,8 @@ using AtomsCalculators.AtomsCalculatorsTesting
 import BioStructures # Imported to avoid clashing names
 using CUDA
 using Enzyme
+using AMDGPU
+using GPUArrays
 using FiniteDifferences
 using KernelDensity
 import SimpleCrystals
@@ -52,6 +54,7 @@ end
 # Allow CUDA device to be specified
 const DEVICE = parse(Int, get(ENV, "DEVICE", "0"))
 
+<<<<<<< HEAD
 const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional()
 const gpu_list = (run_gpu_tests ? (false, true) : (false,))
 if run_gpu_tests
@@ -59,8 +62,27 @@ if run_gpu_tests
     @info "The GPU tests will be run on device $DEVICE"
 elseif get(ENV, "GPUTESTS", "1") == "0"
     @warn "The GPU tests will not be run as GPUTESTS is set to 0"
+=======
+const run_cuda_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional()
+const run_rocm_tests = get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional()
+
+array_list = (Array,)
+
+if run_cuda_tests
+    array_list = (array_list..., CuArray)
+    device!(parse(Int, DEVICE))
+    @info "The CUDA tests will be run on device $DEVICE"
+>>>>>>> c820f41f (Adding KernelAbstractions tooling for Molly and tests)
+else
+    @warn "The CUDA tests will not be run as a CUDA-enabled device is not available"
+end
+
+if run_rocm_tests
+    array_list = (array_list..., ROCArray)
+    AMDGPU.device!(AMDGPU.devices()[parse(Int, DEVICE)+1])
+    @info "The ROCM tests will be run on device $DEVICE"
 else
-    @warn "The GPU tests will not be run as a CUDA-enabled device is not available"
+    @warn "The ROCM tests will not be run as a ROCM-enabled device is not available"
 end
 
 const data_dir = normpath(@__DIR__, "..", "data")
diff --git a/test/simulation.jl b/test/simulation.jl
index c9ada6ef..4a74055f 100644
--- a/test/simulation.jl
+++ b/test/simulation.jl
@@ -424,7 +424,7 @@ end
 end
 
 @testset "Position restraints" begin
-    for gpu in gpu_list
+    for ArrayType in array_list
         n_atoms = 10
         n_atoms_res = n_atoms ÷ 2
         n_steps = 2_000
@@ -435,8 +435,8 @@ end
         sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1")
 
         sys = System(
-            atoms=(gpu ? CuArray(atoms) : atoms),
-            coords=(gpu ? CuArray(copy(starting_coords)) : copy(starting_coords)),
+            atoms=ArrayType(atoms),
+            coords=ArrayType(deepcopy(starting_coords)),
             boundary=boundary,
             atoms_data=atoms_data,
             pairwise_inters=(LennardJones(),),
@@ -925,15 +925,14 @@ end
     vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat))
 
     for sim in (lang_baro, vvand_baro)
-        for gpu in gpu_list
-            if gpu && sim == vvand_baro
+        for ArrayType in array_list
+            if ArrayType <: AbstractGPUArray && sim == vvand_baro
                 continue
             end
-            AT = gpu ? CuArray : Array
 
             sys = System(
-                atoms=AT(atoms),
-                coords=AT(copy(coords)),
+                atoms=ArrayType(atoms),
+                coords=ArrayType(deepcopy(coords)),
                 boundary=boundary,
                 pairwise_inters=(LennardJones(),),
                 loggers=(
@@ -988,16 +987,15 @@ end
         SVector(nothing  , nothing  , nothing  ), # Uncoupled
     )
 
-    for gpu in gpu_list
-        AT = gpu ? CuArray : Array
+    for ArrayType in array_list
         for (press_i, press) in enumerate(pressure_test_set)
-            if gpu && press_i != 2
+            if ArrayType <: AbstractGPUArray && press_i != 2
                 continue
             end
 
             sys = System(
-                atoms=AT(atoms),
-                coords=AT(copy(coords)),
+                atoms=ArrayType(atoms),
+                coords=ArrayType(deepcopy(coords)),
                 boundary=boundary,
                 pairwise_inters=(LennardJones(),),
                 loggers=(
@@ -1056,16 +1054,15 @@ end
         MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true),
     )
 
-    for gpu in gpu_list
-        AT = gpu ? CuArray : Array
+    for ArrayType in array_list
         for (barostat_i, barostat) in enumerate(barostat_test_set)
-            if gpu && barostat_i != 2
+            if ArrayType <: AbstractGPUArray && barostat_i != 2
                 continue
             end
 
             sys = System(
-                atoms=AT(atoms),
-                coords=AT(copy(coords)),
+                atoms=ArrayType(atoms),
+                coords=ArrayType(deepcopy(coords)),
                 boundary=boundary,
                 pairwise_inters=(LennardJones(),),
                 loggers=(
@@ -1179,7 +1176,8 @@ end
     starting_coords_f32 = [Float32.(c) for c in starting_coords]
     starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-    function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
+    function test_sim(nl::Bool, parallel::Bool, f32::Bool,
+                      ArrayType::Type{AT}) where AT <: AbstractArray
         n_atoms = 400
         n_steps = 200
         atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -1189,9 +1187,9 @@ end
         r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
         bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
         specific_inter_lists = (InteractionList2Atoms(
-            gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
-            gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
-            gpu ? CuArray(bonds) : bonds,
+            ArrayType(Int32.(collect(1:2:n_atoms))),
+            ArrayType(Int32.(collect(2:2:n_atoms))),
+            ArrayType(bonds),
         ),)
 
         neighbor_finder = NoNeighborFinder()
@@ -1199,7 +1197,7 @@ end
         pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
         if nl
             neighbor_finder = DistanceNeighborFinder(
-                eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+                eligible=ArrayType(trues(n_atoms, n_atoms)),
                 n_steps=10,
                 dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
             )
@@ -1207,17 +1205,10 @@ end
         end
         show(devnull, neighbor_finder)
 
-        if gpu
-            coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
-            velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
-            atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                                  ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
-        else
-            coords = copy(f32 ? starting_coords_f32 : starting_coords)
-            velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
-            atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                            ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
-        end
+        coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
+        velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
+        atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+                                ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
 
         s = System(
             atoms=atoms,
@@ -1229,7 +1220,7 @@ end
             neighbor_finder=neighbor_finder,
         )
 
-        @test is_on_gpu(s) == gpu
+        @test is_on_gpu(s) == (ArrayType <: AbstractGPUArray)
         @test float_type(s) == (f32 ? Float32 : Float64)
 
         n_threads = parallel ? Threads.nthreads() : 1
@@ -1240,23 +1231,30 @@ end
     end
 
     runs = [
-        ("CPU"       , [false, false, false, false]),
-        ("CPU f32"   , [false, false, true , false]),
-        ("CPU NL"    , [true , false, false, false]),
-        ("CPU f32 NL", [true , false, true , false]),
+        ("CPU"       , [false, false, false, Array]),
+        ("CPU f32"   , [false, false, true , Array]),
+        ("CPU NL"    , [true , false, false, Array]),
+        ("CPU f32 NL", [true , false, true , Array]),
     ]
     if run_parallel_tests
-        push!(runs, ("CPU parallel"       , [false, true , false, false]))
-        push!(runs, ("CPU parallel f32"   , [false, true , true , false]))
-        push!(runs, ("CPU parallel NL"    , [true , true , false, false]))
-        push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
+        push!(runs, ("CPU parallel"       , [false, true , false, Array]))
+        push!(runs, ("CPU parallel f32"   , [false, true , true , Array]))
+        push!(runs, ("CPU parallel NL"    , [true , true , false, Array]))
+        push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
     end
-    if run_gpu_tests
-        push!(runs, ("GPU"       , [false, false, false, true]))
-        push!(runs, ("GPU f32"   , [false, false, true , true]))
-        push!(runs, ("GPU NL"    , [true , false, false, true]))
-        push!(runs, ("GPU f32 NL", [true , false, true , true]))
+    if run_cuda_tests
+        push!(runs, ("GPU"       , [false, false, false, CuArray]))
+        push!(runs, ("GPU f32"   , [false, false, true , CuArray]))
+        push!(runs, ("GPU NL"    , [true , false, false, CuArray]))
+        push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
     end
+    if run_rocm_tests
+        push!(runs, ("GPU"       , [false, false, false, ROCArray]))
+        push!(runs, ("GPU f32"   , [false, false, true , ROCArray]))
+        push!(runs, ("GPU NL"    , [true , false, false, ROCArray]))
+        push!(runs, ("GPU f32 NL", [true , false, true , ROCArray]))
+    end
+
 
     final_coords_ref, E_start_ref = test_sim(runs[1][2]...)
     # Check all simulations give the same result to within some error