From 7575bdfc8ba3e76a2869af4627e3357506c9f8be Mon Sep 17 00:00:00 2001 From: Kai Partmann Date: Mon, 15 Jul 2024 16:06:17 +0200 Subject: [PATCH 1/4] Additional backend types to support Base.Threads --- src/PointNeighbors.jl | 1 + src/neighborhood_search.jl | 16 +++++++++-- src/util.jl | 56 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index b7a8203..ec32a65 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -24,6 +24,7 @@ export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoo export DictionaryCellList, FullGridCellList export ParallelUpdate, SemiParallelUpdate, SerialUpdate export initialize!, update!, initialize_grid!, update_grid! +export PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend export PeriodicBox, copy_neighborhood_search end # module PointNeighbors diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index dacbc10..ea43406 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -36,6 +36,16 @@ in this case to avoid unnecessary updates. The first flag in `points_moving` indicates if points in `x` are moving. The second flag indicates if points in `y` are moving. +!!! warning "Experimental feature: Backend specification" + The keyword argument `parallelization_backend` allows users to specify the + multithreading backend. This feature is currently considered experimental! + + Possible parallelization backends are: + - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` + - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` + - [`PolyesterBackend`](@ref) to use `Polyester.@batch` + - [`KernelAbstractions.Backend`](@ref) to execute the update inside a GPU kernel + See also [`initialize!`](@ref). """ @inline function update!(search::AbstractNeighborhoodSearch, x, y; @@ -130,7 +140,7 @@ Note that `system_coords` and `neighbor_coords` can be identical. See also [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; - parallel::Union{Bool, KernelAbstractions.Backend} = true, + parallel::Union{Bool, ParallelizationBackend} = true, points = axes(system_coords, 2)) where {T} # The type annotation above is to make Julia specialize on the type of the function. # Otherwise, unspecialized code will cause a lot of allocations @@ -141,7 +151,7 @@ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborho # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array, # launch the loop as a kernel on the GPU. parallel_ = Val(parallel) - elseif parallel isa KernelAbstractions.Backend + elseif parallel isa ParallelizationBackend # WARNING! Undocumented, experimental feature: # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel # on this backend. This is useful to test the GPU code on the CPU by passing @@ -165,7 +175,7 @@ end # When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, - backend::KernelAbstractions.Backend) + backend::ParallelizationBackend) @threaded backend for point in points foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) end diff --git a/src/util.jl b/src/util.jl index d979ac2..8af9de6 100644 --- a/src/util.jl +++ b/src/util.jl @@ -22,6 +22,38 @@ end return floor(Int, i) end +abstract type AbstractUserBackend end + +""" + PolyesterBackend() + +A type to specify the parallelization backend when using [`@threaded`](@ref). If this type +is specified, the @threaded macro will use `Polyester.@batch` for the multithreaded `for` +loop. +""" +struct PolyesterBackend <: AbstractUserBackend end + +""" + ThreadsDynamicBackend() + +A type to specify the parallelization backend when using [`@threaded`](@ref). If this type +is specified, the @threaded macro will use `Threads.@threads :dynamic` for the multithreaded +`for` loop. +""" +struct ThreadsDynamicBackend <: AbstractUserBackend end + +""" + ThreadsStaticBackend() + + +A type to specify the parallelization backend when using [`@threaded`](@ref). If this type +is specified, the @threaded macro will use `Threads.@threads :static` for the multithreaded +`for` loop. +""" +struct ThreadsStaticBackend <: AbstractUserBackend end + +const ParallelizationBackend = Union{AbstractUserBackend, KernelAbstractions.Backend} + """ @threaded x for ... end @@ -30,15 +62,21 @@ Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitR but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -The first argument must either be a `KernelAbstractions.Backend` or an array from which the +The first argument must either be a parallelization backend or an array from which the backend can be derived to determine if the loop must be run threaded on the CPU or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU kernel on the CPU. +Possible parallelization backends are: +- [`PolyesterBackend`](@ref) to use `Polyester.@batch` +- [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` +- [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` +- [`KernelAbstractions.Backend`](@ref) to execute the loop inside a GPU kernel + In particular, the underlying threading capabilities might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). -!!! warn +!!! warning "Not suited for general loops" This macro does not necessarily work for general `for` loops. For example, it does not necessarily support general iterables such as `eachline(filename)`. """ @@ -67,6 +105,20 @@ end end end +# Use `Threads.@threads :dynamic` +@inline function parallel_foreach(f, iterator, x::ThreadsDynamicBackend) + Threads.@threads :dynamic for i in iterator + @inline f(i) + end +end + +# Use `Threads.@threads :static` +@inline function parallel_foreach(f, iterator, x::ThreadsStaticBackend) + Threads.@threads :static for i in iterator + @inline f(i) + end +end + # On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl @inline function parallel_foreach(f, iterator, x::Union{AbstractGPUArray, KernelAbstractions.Backend}) From eac2d0bf61ea308b0f439013e45ec2c728114f1b Mon Sep 17 00:00:00 2001 From: Kai Partmann Date: Tue, 16 Jul 2024 14:10:04 +0200 Subject: [PATCH 2/4] Removed reference to `KernelAbstractions.Backend` in docstrings --- src/neighborhood_search.jl | 2 +- src/util.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index ea43406..f0e767a 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -44,7 +44,7 @@ The second flag indicates if points in `y` are moving. - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - - [`KernelAbstractions.Backend`](@ref) to execute the update inside a GPU kernel + - `KernelAbstractions.Backend` to execute the update inside a GPU kernel See also [`initialize!`](@ref). """ diff --git a/src/util.jl b/src/util.jl index 8af9de6..dbe6955 100644 --- a/src/util.jl +++ b/src/util.jl @@ -71,7 +71,7 @@ Possible parallelization backends are: - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` -- [`KernelAbstractions.Backend`](@ref) to execute the loop inside a GPU kernel +- `KernelAbstractions.Backend` to execute the loop inside a GPU kernel In particular, the underlying threading capabilities might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). From 5aa0495524ed648af15c2a3712f93c3dc246db71 Mon Sep 17 00:00:00 2001 From: Kai Partmann Date: Tue, 23 Jul 2024 18:40:02 +0200 Subject: [PATCH 3/4] Fixed suggestions --- src/neighborhood_search.jl | 5 ++--- src/util.jl | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index f0e767a..4f1ba6b 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -36,7 +36,7 @@ in this case to avoid unnecessary updates. The first flag in `points_moving` indicates if points in `x` are moving. The second flag indicates if points in `y` are moving. -!!! warning "Experimental feature: Backend specification" +!!! warning "Experimental Feature: Backend Specification" The keyword argument `parallelization_backend` allows users to specify the multithreading backend. This feature is currently considered experimental! @@ -44,7 +44,7 @@ The second flag indicates if points in `y` are moving. - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - - `KernelAbstractions.Backend` to execute the update inside a GPU kernel + - `KernelAbstractions.Backend` to launch a GPU kernel See also [`initialize!`](@ref). """ @@ -152,7 +152,6 @@ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborho # launch the loop as a kernel on the GPU. parallel_ = Val(parallel) elseif parallel isa ParallelizationBackend - # WARNING! Undocumented, experimental feature: # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel # on this backend. This is useful to test the GPU code on the CPU by passing # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`. diff --git a/src/util.jl b/src/util.jl index dbe6955..d172204 100644 --- a/src/util.jl +++ b/src/util.jl @@ -22,16 +22,15 @@ end return floor(Int, i) end -abstract type AbstractUserBackend end +abstract type AbstractThreadingBackend end """ PolyesterBackend() -A type to specify the parallelization backend when using [`@threaded`](@ref). If this type -is specified, the @threaded macro will use `Polyester.@batch` for the multithreaded `for` -loop. +Pass as first argument to the [`@threaded`](@ref) macro to make the loop multithreaded +with `Polyester.@batch`. """ -struct PolyesterBackend <: AbstractUserBackend end +struct PolyesterBackend <: AbstractThreadingBackend end """ ThreadsDynamicBackend() @@ -40,7 +39,7 @@ A type to specify the parallelization backend when using [`@threaded`](@ref). If is specified, the @threaded macro will use `Threads.@threads :dynamic` for the multithreaded `for` loop. """ -struct ThreadsDynamicBackend <: AbstractUserBackend end +struct ThreadsDynamicBackend <: AbstractThreadingBackend end """ ThreadsStaticBackend() @@ -50,9 +49,9 @@ A type to specify the parallelization backend when using [`@threaded`](@ref). If is specified, the @threaded macro will use `Threads.@threads :static` for the multithreaded `for` loop. """ -struct ThreadsStaticBackend <: AbstractUserBackend end +struct ThreadsStaticBackend <: AbstractThreadingBackend end -const ParallelizationBackend = Union{AbstractUserBackend, KernelAbstractions.Backend} +const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend} """ @threaded x for ... end @@ -62,8 +61,8 @@ Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitR but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -The first argument must either be a parallelization backend or an array from which the -backend can be derived to determine if the loop must be run threaded on the CPU +The first argument must either be a parallelization backend (see below) or an array from +which the backend can be derived to determine if the loop must be run threaded on the CPU or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU kernel on the CPU. @@ -71,12 +70,12 @@ Possible parallelization backends are: - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` -- `KernelAbstractions.Backend` to execute the loop inside a GPU kernel +- `KernelAbstractions.Backend` to execute the loop as a GPU kernel In particular, the underlying threading capabilities might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). -!!! warning "Not suited for general loops" +!!! warning "Warning" This macro does not necessarily work for general `for` loops. For example, it does not necessarily support general iterables such as `eachline(filename)`. """ @@ -99,6 +98,7 @@ macro threaded(system, expr) end # Use `Polyester.@batch` for low-overhead threading +# This is currently the default when x::Array @inline function parallel_foreach(f, iterator, x) Polyester.@batch for i in iterator @inline f(i) From ac5afc335d8bafecf89b089fbdb724b54b7fcd5c Mon Sep 17 00:00:00 2001 From: Kai Partmann Date: Wed, 24 Jul 2024 11:38:27 +0200 Subject: [PATCH 4/4] Consistency in docstrings --- src/util.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/util.jl b/src/util.jl index d172204..c44e6ff 100644 --- a/src/util.jl +++ b/src/util.jl @@ -35,9 +35,8 @@ struct PolyesterBackend <: AbstractThreadingBackend end """ ThreadsDynamicBackend() -A type to specify the parallelization backend when using [`@threaded`](@ref). If this type -is specified, the @threaded macro will use `Threads.@threads :dynamic` for the multithreaded -`for` loop. +Pass as first argument to the [`@threaded`](@ref) macro to make the loop multithreaded +with `Threads.@threads :dynamic`. """ struct ThreadsDynamicBackend <: AbstractThreadingBackend end @@ -45,9 +44,8 @@ struct ThreadsDynamicBackend <: AbstractThreadingBackend end ThreadsStaticBackend() -A type to specify the parallelization backend when using [`@threaded`](@ref). If this type -is specified, the @threaded macro will use `Threads.@threads :static` for the multithreaded -`for` loop. +Pass as first argument to the [`@threaded`](@ref) macro to make the loop multithreaded +with `Threads.@threads :static`. """ struct ThreadsStaticBackend <: AbstractThreadingBackend end