diff --git a/Derivs/src/derivs.cxx b/Derivs/src/derivs.cxx index 43dc63eef..a5a2c98b2 100644 --- a/Derivs/src/derivs.cxx +++ b/Derivs/src/derivs.cxx @@ -10,9 +10,29 @@ using namespace Loop; template CCTK_ATTRIBUTE_NOINLINE void -calc_derivs(const vec, dim> &dgf, const GridDescBaseDevice &grid, - const GF3D5 &gf, const GF3D5layout layout, - const vect dx, const int deriv_order) { +calc_copy(const GF3D5 &gf, const GF3D5layout layout, + const GridDescBaseDevice &grid, const GF3D2 &gf0, + const vect dx) { + using vreal = simd; + using vbool = simdl; + constexpr std::size_t vsize = std::tuple_size_v; + + grid.loop_int_device( + grid.nghostzones, + [=] CCTK_DEVICE(const PointDesc &p) CCTK_ATTRIBUTE_ALWAYS_INLINE { + const vbool mask = mask_for_loop_tail(p.i, p.imax); + const GF3D5index index(layout, p.I); + const auto val = gf0(mask, p.I); + gf.store(mask, index, val); + }); +} + +template +CCTK_ATTRIBUTE_NOINLINE void +calc_derivs(const GF3D5 &gf, const vec, dim> &dgf, + const GF3D5layout layout, const GridDescBaseDevice &grid, + const GF3D2 &gf0, const vect dx, + const int deriv_order) { using vreal = simd; using vbool = simdl; constexpr std::size_t vsize = std::tuple_size_v; @@ -25,7 +45,9 @@ calc_derivs(const vec, dim> &dgf, const GridDescBaseDevice &grid, [=] CCTK_DEVICE(const PointDesc &p) CCTK_ATTRIBUTE_ALWAYS_INLINE { const vbool mask = mask_for_loop_tail(p.i, p.imax); const GF3D5index index(layout, p.I); - const auto dval = calc_deriv<2>(gf, mask, layout, p.I, dx); + const auto val = gf0(mask, p.I); + const auto dval = calc_deriv<2>(gf0, mask, p.I, dx); + gf.store(mask, index, val); dgf.store(mask, index, dval); }); break; @@ -36,7 +58,9 @@ calc_derivs(const vec, dim> &dgf, const GridDescBaseDevice &grid, [=] CCTK_DEVICE(const PointDesc &p) CCTK_ATTRIBUTE_ALWAYS_INLINE { const vbool mask = mask_for_loop_tail(p.i, p.imax); const GF3D5index index(layout, p.I); - const auto dval = calc_deriv<4>(gf, mask, layout, p.I, dx); + const auto val = gf0(mask, p.I); + const auto dval = calc_deriv<4>(gf0, mask, p.I, dx); + gf.store(mask, index, val); dgf.store(mask, index, dval); }); break; @@ -48,10 +72,10 @@ calc_derivs(const vec, dim> &dgf, const GridDescBaseDevice &grid, template CCTK_ATTRIBUTE_NOINLINE void -calc_derivs2(const vec, dim> &dgf, const smat, dim> &ddgf, - const GridDescBaseDevice &grid, const GF3D5 &gf, - const GF3D5layout layout, const vect dx, - const int deriv_order) { +calc_derivs2(const GF3D5 &gf, const vec, dim> &dgf, + const smat, dim> &ddgf, const GF3D5layout layout, + const GridDescBaseDevice &grid, const GF3D2 &gf0, + const vect dx, const int deriv_order) { using vreal = simd; using vbool = simdl; constexpr std::size_t vsize = std::tuple_size_v; @@ -64,8 +88,10 @@ calc_derivs2(const vec, dim> &dgf, const smat, dim> &ddgf, [=] CCTK_DEVICE(const PointDesc &p) CCTK_ATTRIBUTE_ALWAYS_INLINE { const vbool mask = mask_for_loop_tail(p.i, p.imax); const GF3D5index index(layout, p.I); - const auto dval = calc_deriv<2>(gf, mask, layout, p.I, dx); - const auto ddval = calc_deriv2<2>(gf, mask, layout, p.I, dx); + const auto val = gf0(mask, p.I); + const auto dval = calc_deriv<2>(gf0, mask, p.I, dx); + const auto ddval = calc_deriv2<2>(gf0, mask, p.I, dx); + gf.store(mask, index, val); dgf.store(mask, index, dval); ddgf.store(mask, index, ddval); }); @@ -77,8 +103,10 @@ calc_derivs2(const vec, dim> &dgf, const smat, dim> &ddgf, [=] CCTK_DEVICE(const PointDesc &p) CCTK_ATTRIBUTE_ALWAYS_INLINE { const vbool mask = mask_for_loop_tail(p.i, p.imax); const GF3D5index index(layout, p.I); - const auto dval = calc_deriv<4>(gf, mask, layout, p.I, dx); - const auto ddval = calc_deriv2<4>(gf, mask, layout, p.I, dx); + const auto val = gf0(mask, p.I); + const auto dval = calc_deriv<4>(gf0, mask, p.I, dx); + const auto ddval = calc_deriv2<4>(gf0, mask, p.I, dx); + gf.store(mask, index, val); dgf.store(mask, index, dval); ddgf.store(mask, index, ddval); }); @@ -96,55 +124,56 @@ calc_derivs2(const vec, dim> &dgf, const smat, dim> &ddgf, using T = CCTK_REAL; template CCTK_DEVICE CCTK_HOST Arith::vec, Loop::dim> -calc_deriv<2>(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, +calc_deriv<2>(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::vec, Loop::dim> -calc_deriv<4>(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, +calc_deriv<4>(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::vec -calc_deriv<2>(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, +calc_deriv<2>(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::vec -calc_deriv<4>(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, +calc_deriv<4>(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::smat, Loop::dim> -calc_deriv2<2>(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, +calc_deriv2<2>(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::smat, Loop::dim> -calc_deriv2<4>(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, +calc_deriv2<4>(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::smat -calc_deriv2<2>(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, +calc_deriv2<2>(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx); template CCTK_DEVICE CCTK_HOST Arith::smat -calc_deriv2<4>(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, +calc_deriv2<4>(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx); -template void calc_derivs<0, 0, 0>(const vec, dim> &dgf, - const GridDescBaseDevice &grid, - const GF3D5 &gf, - const GF3D5layout layout, - const vect dx, - const int deriv_order); - -template void calc_derivs2<0, 0, 0>( - const vec, dim> &dgf, const smat, dim> &ddgf, - const GridDescBaseDevice &grid, const GF3D5 &gf, - const GF3D5layout layout, const vect dx, const int deriv_order); +template void calc_copy<0, 0, 0>(const GF3D5 &gf, const GF3D5layout layout, + const GridDescBaseDevice &grid, + const GF3D2 &gf0, + const vect dx); + +template void +calc_derivs<0, 0, 0>(const GF3D5 &gf, const vec, dim> &dgf, + const GF3D5layout layout, const GridDescBaseDevice &grid, + const GF3D2 &gf0, const vect dx, + const int deriv_order); + +template void +calc_derivs2<0, 0, 0>(const GF3D5 &gf, const vec, dim> &dgf, + const smat, dim> &ddgf, const GF3D5layout layout, + const GridDescBaseDevice &grid, const GF3D2 &gf0, + const vect dx, const int deriv_order); } // namespace Derivs diff --git a/Derivs/src/derivs.hxx b/Derivs/src/derivs.hxx index d7a54c26f..d0b931cc7 100644 --- a/Derivs/src/derivs.hxx +++ b/Derivs/src/derivs.hxx @@ -332,19 +332,18 @@ inline CCTK_ATTRIBUTE_ALWAYS_INLINE template inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST Arith::vec, Loop::dim> - calc_deriv(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, + calc_deriv(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx) { using namespace Arith; using namespace Loop; // We use explicit index calculations to avoid unnecessary integer // multiplications - const T *restrict const ptr = &gf(layout, I); + const T *restrict const ptr = &gf(I); const std::array offsets{ - layout.delta(1, 0, 0), - layout.delta(0, 1, 0), - layout.delta(0, 0, 1), + gf.delta(1, 0, 0), + gf.delta(0, 1, 0), + gf.delta(0, 0, 1), }; return { detail::deriv1d( @@ -368,18 +367,18 @@ inline CCTK_ATTRIBUTE_ALWAYS_INLINE template inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST Arith::vec - calc_deriv(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, + calc_deriv(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx) { using namespace Arith; using namespace Loop; // We use explicit index calculations to avoid unnecessary integer // multiplications - const T *restrict const ptr = &gf(layout, I); + const T *restrict const ptr = &gf(I); const std::array offsets{ - layout.delta(1, 0, 0), - layout.delta(0, 1, 0), - layout.delta(0, 0, 1), + gf.delta(1, 0, 0), + gf.delta(0, 1, 0), + gf.delta(0, 0, 1), }; return { detail::deriv1d( @@ -400,19 +399,18 @@ inline CCTK_ATTRIBUTE_ALWAYS_INLINE template inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST Arith::smat, Loop::dim> - calc_deriv2(const Loop::GF3D5 &gf, const Arith::simdl &mask, - const Loop::GF3D5layout &layout, + calc_deriv2(const Loop::GF3D2 &gf, const Arith::simdl &mask, const Arith::vect &I, const Arith::vect &dx) { using namespace Arith; using namespace Loop; // We use explicit index calculations to avoid unnecessary integer // multiplications - const T *restrict const ptr = &gf(layout, I); + const T *restrict const ptr = &gf(I); const std::array offsets{ - layout.delta(1, 0, 0), - layout.delta(0, 1, 0), - layout.delta(0, 0, 1), + gf.delta(1, 0, 0), + gf.delta(0, 1, 0), + gf.delta(0, 0, 1), }; return { detail::deriv2_1d( @@ -451,18 +449,18 @@ inline CCTK_ATTRIBUTE_ALWAYS_INLINE template inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST Arith::smat - calc_deriv2(const Loop::GF3D5 &gf, const Loop::GF3D5layout &layout, + calc_deriv2(const Loop::GF3D2 &gf, const Arith::vect &I, const Arith::vect &dx) { using namespace Arith; using namespace Loop; // We use explicit index calculations to avoid unnecessary integer // multiplications - const T *restrict const ptr = &gf(layout, I); + const T *restrict const ptr = &gf(I); const std::array offsets{ - layout.delta(1, 0, 0), - layout.delta(0, 1, 0), - layout.delta(0, 0, 1), + gf.delta(1, 0, 0), + gf.delta(0, 1, 0), + gf.delta(0, 0, 1), }; return { detail::deriv2_1d( @@ -501,18 +499,24 @@ inline CCTK_ATTRIBUTE_ALWAYS_INLINE template CCTK_ATTRIBUTE_NOINLINE void -calc_derivs(const Arith::vec, Loop::dim> &dgf, - const Loop::GridDescBaseDevice &grid, - const Loop::GF3D5 &gf, const Loop::GF3D5layout layout, - const Arith::vect dx, const int deriv_order); +calc_copy(const Loop::GF3D5 &gf, const Loop::GF3D5layout layout, + const Loop::GridDescBaseDevice &grid, const Loop::GF3D2 &gf0, + const Arith::vect dx); template -CCTK_ATTRIBUTE_NOINLINE void -calc_derivs2(const Arith::vec, Loop::dim> &dgf, - const Arith::smat, Loop::dim> &ddgf, - const Loop::GridDescBaseDevice &grid, - const Loop::GF3D5 &gf, const Loop::GF3D5layout layout, - const Arith::vect dx, const int deriv_order); +CCTK_ATTRIBUTE_NOINLINE void calc_derivs( + const Loop::GF3D5 &gf, const Arith::vec, Loop::dim> &dgf, + const Loop::GF3D5layout layout, const Loop::GridDescBaseDevice &grid, + const Loop::GF3D2 &gf0, const Arith::vect dx, + const int deriv_order); + +template +CCTK_ATTRIBUTE_NOINLINE void calc_derivs2( + const Loop::GF3D5 &gf, const Arith::vec, Loop::dim> &dgf, + const Arith::smat, Loop::dim> &ddgf, + const Loop::GF3D5layout layout, const Loop::GridDescBaseDevice &grid, + const Loop::GF3D2 &gf0, const Arith::vect dx, + const int deriv_order); } // namespace Derivs