From 17601375868c6473dbd5b4f162260a48bd160e1b Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Mon, 11 Mar 2024 09:33:53 -0400 Subject: [PATCH] Improve CPU performance implementation --- test/utils.jl | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/utils.jl b/test/utils.jl index a31f4eb..b44af9c 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -45,13 +45,25 @@ if !hasmethod(Base.copyto!, Tuple{<:FusedMultiBroadcast}) end end +# This is better than the baseline. function copyto_cpu!(pairs::T, ei::EI) where {T, EI} - @inbounds @simd ivdep for i in ei - MBF.rcopyto_at!(pairs, i) + for (dest, bc) in pairs + @inbounds @simd ivdep for i in ei + dest[i] = bc[i] + end end return nothing end +# This should, in theory, be better, but it seems like inlining is +# failing somewhere. +# function copyto_cpu!(pairs::T, ei::EI) where {T, EI} +# @inbounds @simd ivdep for i in ei +# MBF.rcopyto_at!(pairs, i) +# end +# return nothing +# end + import CUDA import Adapt function copyto_cuda!(pairs::Tuple) # (Pair(dest1, bc1),Pair(dest2, bc2),...)