From a44955dd2ebcc330fc3415e921e5ed02764cbade Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 11 Sep 2024 09:20:36 -0600 Subject: [PATCH 01/15] Update Kokkos library in LAMMPS to v4.4.0 --- lib/kokkos/ | 78 +- lib/kokkos/CITATION.cff | 65 + lib/kokkos/CMakeLists.txt | 4 +- lib/kokkos/Makefile.kokkos | 64 +- lib/kokkos/Makefile.targets | 2 +- lib/kokkos/ | 63 +- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 55 +- .../src/std_algorithms/Kokkos_ForEach.hpp | 56 +- .../impl/Kokkos_AdjacentDifference.hpp | 10 + .../impl/Kokkos_Constraints.hpp | 61 +- .../src/std_algorithms/impl/Kokkos_CopyIf.hpp | 5 +- .../impl/Kokkos_ForEachForEachN.hpp | 20 +- .../impl/Kokkos_RandomAccessIterator.hpp | 31 + .../std_algorithms/impl/Kokkos_UniqueCopy.hpp | 5 +- .../unit_tests/TestRandomAccessIterator.cpp | 38 + .../algorithms/unit_tests/TestSortByKey.hpp | 14 +- .../TestStdAlgorithmsConstraints.cpp | 109 + .../TestStdAlgorithmsTeamExclusiveScan.cpp | 6 +- .../TestStdAlgorithmsTeamIsSorted.cpp | 4 +- .../TestStdAlgorithmsTeamIsSortedUntil.cpp | 4 +- .../TestStdAlgorithmsTeamMaxElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinMaxElement.cpp | 4 +- .../TestStdAlgorithmsTeamReduce.cpp | 2 +- ...tdAlgorithmsTeamTransformExclusiveScan.cpp | 2 +- ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 2 +- .../TestStdAlgorithmsTeamTransformReduce.cpp | 2 +- lib/kokkos/appveyor.yml | 2 +- lib/kokkos/benchmarks/CMakeLists.txt | 2 +- .../view_copy_constructor/CMakeLists.txt | 4 + .../benchmarks/view_copy_constructor/Makefile | 46 + .../view_copy_constructor.cpp | 310 +++ lib/kokkos/bin/nvcc_wrapper | 2 +- lib/kokkos/cmake/Dependencies.cmake | 1 - lib/kokkos/cmake/ | 7 +- lib/kokkos/cmake/ | 4 +- lib/kokkos/cmake/Modules/FindTPLCUDA.cmake | 57 +- lib/kokkos/cmake/deps/CUDA.cmake | 1 - lib/kokkos/cmake/deps/CUSPARSE.cmake | 26 - lib/kokkos/cmake/fake_tribits.cmake | 8 - lib/kokkos/cmake/kokkos_arch.cmake | 86 +- lib/kokkos/cmake/kokkos_compiler_id.cmake | 
83 +- lib/kokkos/cmake/kokkos_enable_options.cmake | 6 +- lib/kokkos/cmake/kokkos_functions.cmake | 7 +- lib/kokkos/cmake/kokkos_test_cxx_std.cmake | 8 +- lib/kokkos/cmake/kokkos_tpls.cmake | 10 +- lib/kokkos/cmake/kokkos_tribits.cmake | 47 +- lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake | 26 - lib/kokkos/containers/src/Kokkos_DualView.hpp | 58 +- .../containers/src/Kokkos_DynRankView.hpp | 410 ++-- .../containers/src/Kokkos_DynamicView.hpp | 333 ++-- .../containers/src/Kokkos_OffsetView.hpp | 263 ++- .../containers/src/Kokkos_UnorderedMap.hpp | 114 +- .../containers/unit_tests/TestDualView.hpp | 140 +- .../unit_tests/TestUnorderedMap.hpp | 7 +- .../containers/unit_tests/TestVector.hpp | 4 +- lib/kokkos/core/perf_test/test_atomic.cpp | 3 +- .../perf_test/test_atomic_minmax_simple.cpp | 8 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp | 15 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 67 +- .../core/src/Cuda/Kokkos_Cuda_Error.hpp | 47 - .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 9 +- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 8 + .../core/src/Cuda/Kokkos_Cuda_Instance.hpp | 30 +- .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 38 +- .../core/src/Cuda/Kokkos_Cuda_abort.hpp | 25 +- lib/kokkos/core/src/HIP/Kokkos_HIP.hpp | 15 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp | 37 - .../core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 4 +- .../core/src/HIP/Kokkos_HIP_Instance.cpp | 16 + .../core/src/HIP/Kokkos_HIP_Instance.hpp | 29 +- .../HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp | 1 + .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 3 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp | 17 +- lib/kokkos/core/src/HPX/Kokkos_HPX.cpp | 4 +- lib/kokkos/core/src/HPX/Kokkos_HPX.hpp | 18 +- lib/kokkos/core/src/Kokkos_Array.hpp | 102 +- .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 1 - .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 2 - lib/kokkos/core/src/Kokkos_Complex.hpp | 76 + lib/kokkos/core/src/Kokkos_CopyViews.hpp | 527 ++--- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 81 +- 
lib/kokkos/core/src/Kokkos_Extents.hpp | 68 +- lib/kokkos/core/src/Kokkos_Graph.hpp | 3 + lib/kokkos/core/src/Kokkos_HostSpace.hpp | 2 - lib/kokkos/core/src/Kokkos_Layout.hpp | 120 +- lib/kokkos/core/src/Kokkos_Macros.hpp | 53 + .../core/src/Kokkos_MathematicalFunctions.hpp | 8 + lib/kokkos/core/src/Kokkos_Pair.hpp | 22 +- lib/kokkos/core/src/Kokkos_Parallel.hpp | 33 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 77 +- lib/kokkos/core/src/Kokkos_View.hpp | 262 +-- .../core/src/OpenACC/Kokkos_OpenACCSpace.cpp | 11 +- .../Kokkos_OpenACC_ParallelFor_Team.hpp | 8 +- lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp | 21 +- lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp | 18 +- .../src/OpenMP/Kokkos_OpenMP_Instance.cpp | 38 +- .../src/OpenMP/Kokkos_OpenMP_Instance.hpp | 22 +- .../src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp | 12 +- .../OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp | 20 +- .../OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp | 10 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.cpp | 13 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 11 +- .../src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 3 +- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 6 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 40 - .../Kokkos_OpenMPTarget_Instance.cpp | 1 - .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 5 + .../Kokkos_OpenMPTarget_Parallel.hpp | 4 - ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 383 ++++ ...s_OpenMPTarget_ParallelReduce_MDRange.hpp} | 631 +++--- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 9 +- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 10 +- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 10 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 25 + .../src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp | 157 ++ .../src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp | 56 + .../core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 174 ++ .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 34 +- .../core/src/SYCL/Kokkos_SYCL_Instance.hpp | 34 +- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 26 +- 
.../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 61 +- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 73 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 64 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 67 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 134 +- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 223 ++- .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 89 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp | 111 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 103 +- lib/kokkos/core/src/Serial/Kokkos_Serial.cpp | 41 +- lib/kokkos/core/src/Serial/Kokkos_Serial.hpp | 29 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 17 +- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 28 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 16 +- .../core/src/Threads/Kokkos_Threads_Team.hpp | 7 +- lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 318 +++ .../View/MDSpan/Kokkos_MDSpan_Accessor.hpp | 220 +++ .../src/View/MDSpan/Kokkos_MDSpan_Extents.hpp | 19 +- .../src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 156 ++ .../core/src/decl/Kokkos_Declare_SYCL.hpp | 3 + lib/kokkos/core/src/impl/Kokkos_Core.cpp | 37 +- .../src/impl/Kokkos_Default_Graph_Impl.hpp | 7 +- .../Kokkos_DesulAtomicsConfig.hpp} | 12 +- lib/kokkos/core/src/impl/Kokkos_Error.cpp | 121 +- lib/kokkos/core/src/impl/Kokkos_Error.hpp | 107 +- lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 23 +- .../core/src/impl/Kokkos_HostThreadTeam.hpp | 23 +- lib/kokkos/core/src/impl/Kokkos_Profiling.cpp | 78 - lib/kokkos/core/src/impl/Kokkos_Profiling.hpp | 59 +- .../src/impl/Kokkos_Profiling_C_Interface.h | 8 + .../core/src/impl/Kokkos_SharedAlloc.cpp | 35 - .../core/src/impl/Kokkos_SharedAlloc.hpp | 86 +- lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp | 622 ------ .../core/src/impl/Kokkos_ViewLayoutTiled.hpp | 1425 ------------- .../core/src/impl/Kokkos_ViewMapping.hpp | 544 ++--- .../core/src/setup/Kokkos_Setup_Cuda.hpp | 2 + .../core/src/setup/Kokkos_Setup_HIP.hpp | 2 + .../core/src/setup/Kokkos_Setup_SYCL.hpp | 17 + 
lib/kokkos/core/unit_test/CMakeLists.txt | 125 +- lib/kokkos/core/unit_test/Makefile | 22 +- lib/kokkos/core/unit_test/TestAggregate.hpp | 108 - lib/kokkos/core/unit_test/TestArray.cpp | 193 +- lib/kokkos/core/unit_test/TestArrayOps.hpp | 29 + .../core/unit_test/TestAtomicOperations.hpp | 8 +- .../unit_test/TestBitManipulationBuiltins.hpp | 6 - lib/kokkos/core/unit_test/TestComplex.hpp | 166 +- .../unit_test/TestExecSpaceThreadSafety.hpp | 327 +++ .../core/unit_test/TestExecutionSpace.hpp | 56 + lib/kokkos/core/unit_test/TestGraph.hpp | 71 +- .../core/unit_test/TestLocalDeepCopy.hpp | 28 +- lib/kokkos/core/unit_test/TestMDSpan.hpp | 8 +- .../unit_test/TestMDSpanAtomicAccessor.hpp | 112 ++ .../core/unit_test/TestMDSpanConversion.hpp | 507 +++++ .../unit_test/TestMathematicalConstants.hpp | 3 +- .../unit_test/TestMathematicalFunctions.hpp | 80 +- lib/kokkos/core/unit_test/TestMultiGPU.hpp | 184 ++ .../core/unit_test/TestNestedReducerCTAD.cpp | 246 +++ .../core/unit_test/TestNumericTraits.hpp | 105 +- lib/kokkos/core/unit_test/TestOther.hpp | 5 - .../unit_test/TestRangePolicyConstructors.hpp | 40 + lib/kokkos/core/unit_test/TestRealloc.hpp | 13 + lib/kokkos/core/unit_test/TestResize.hpp | 13 + .../core/unit_test/TestSpaceAwareAccessor.hpp | 156 ++ .../TestSpaceAwareAccessorAccessViolation.hpp | 128 ++ .../unit_test/TestTeamMDRangePolicyCTAD.cpp | 199 ++ .../core/unit_test/TestTeamPolicyCTAD.cpp | 135 ++ lib/kokkos/core/unit_test/TestTeamVector.hpp | 7 +- .../core/unit_test/TestTeamVectorRange.hpp | 6 - lib/kokkos/core/unit_test/TestViewAPI.hpp | 98 +- lib/kokkos/core/unit_test/TestViewAPI_c.hpp | 1 + lib/kokkos/core/unit_test/TestViewAPI_d.hpp | 18 - .../core/unit_test/TestViewBadAlloc.hpp | 86 + lib/kokkos/core/unit_test/TestViewCopy_c.hpp | 434 ++++ .../core/unit_test/TestViewLayoutTiled.hpp | 1756 ----------------- lib/kokkos/core/unit_test/TestViewOfViews.hpp | 75 + lib/kokkos/core/unit_test/TestViewSubview.hpp | 5 +- .../core/unit_test/UnitTest_ScopeGuard.cpp | 
155 ++ .../category_files/TestHPX_Category.hpp | 1 + .../category_files/TestOpenACC_Category.hpp | 1 + .../TestOpenMPTarget_Category.hpp | 1 + .../category_files/TestSYCL_Category.hpp | 1 + .../category_files/TestThreads_Category.hpp | 1 + .../core/unit_test/cuda/TestCuda_Graph.cpp | 18 - .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 162 +- .../headers_self_contained/CMakeLists.txt | 3 +- .../hip/TestHIP_Memory_Requirements.cpp | 3 - .../incremental/Test01_execspace.hpp | 2 + .../unit_test/openmp/TestOpenMP_Graph.cpp | 18 - .../unit_test/serial/TestSerial_Graph.cpp | 18 - .../sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp | 64 + .../view/TestExtentsDatatypeConversion.cpp | 11 +- lib/kokkos/example/README | 4 +- .../build_cmake_installed/CMakeLists.txt | 1 + .../tutorial/01_hello_world/hello_world.cpp | 19 +- .../hello_world_lambda.cpp | 14 +- .../simple_reduce_lambda.cpp | 5 +- lib/kokkos/generate_makefile.bash | 1 - lib/kokkos/master_history.txt | 1 + lib/kokkos/simd/src/Kokkos_SIMD.hpp | 9 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 1011 +++++++++- lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp | 1234 +++++++++++- lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp | 835 +++++++- lib/kokkos/simd/unit_tests/CMakeLists.txt | 12 +- .../unit_tests/include/SIMDTesting_Ops.hpp | 2 + .../include/SIMDTesting_Utilities.hpp | 12 +- .../unit_tests/include/TestSIMD_Condition.hpp | 66 +- .../include/TestSIMD_Conversions.hpp | 142 +- .../include/TestSIMD_GeneratorCtors.hpp | 124 +- .../unit_tests/include/TestSIMD_MaskOps.hpp | 80 +- .../unit_tests/include/TestSIMD_MathOps.hpp | 89 +- .../include/TestSIMD_Reductions.hpp | 40 +- .../unit_tests/include/TestSIMD_ShiftOps.hpp | 86 +- .../include/TestSIMD_WhereExpressions.hpp | 158 +- .../desul/include/desul/atomics/Adapt_HIP.hpp | 77 + .../include/desul/atomics/Atomic_Ref.hpp | 554 +----- .../desul/atomics/Compare_Exchange_HIP.hpp | 145 +- .../include/desul/atomics/Fetch_Op_CUDA.hpp | 54 +- .../desul/atomics/Fetch_Op_Generic.hpp | 92 +- 
.../include/desul/atomics/Fetch_Op_HIP.hpp | 167 +- .../atomics/Operator_Function_Objects.hpp | 34 +- .../experimental/__p0009_bits/config.hpp | 2 +- .../experimental/__p0009_bits/extents.hpp | 95 +- .../experimental/__p0009_bits/layout_left.hpp | 26 +- .../__p0009_bits/layout_right.hpp | 25 +- .../__p0009_bits/layout_stride.hpp | 185 +- .../experimental/__p0009_bits/macros.hpp | 70 +- .../experimental/__p0009_bits/mdspan.hpp | 4 +- .../experimental/__p0009_bits/utility.hpp | 72 + .../experimental/__p2389_bits/dims.hpp} | 14 +- .../__p2630_bits/submdspan_mapping.hpp | 684 +++++-- .../__p2642_bits/layout_padded.hpp | 536 ++--- .../__p2642_bits/layout_padded_fwd.hpp | 62 +- .../tpls/mdspan/include/mdspan/mdspan.hpp | 1 + 254 files changed, 14302 insertions(+), 9956 deletions(-) create mode 100644 lib/kokkos/CITATION.cff create mode 100644 lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt create mode 100644 lib/kokkos/benchmarks/view_copy_constructor/Makefile create mode 100644 lib/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp delete mode 100644 lib/kokkos/cmake/deps/CUSPARSE.cmake delete mode 100644 lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake create mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp rename lib/kokkos/core/src/OpenMPTarget/{Kokkos_OpenMPTarget_Parallel_MDRange.hpp => Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp} (61%) create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp create mode 100644 lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp create mode 100644 lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp create mode 100644 lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp rename lib/kokkos/core/src/{Kokkos_Atomics_Desul_Config.hpp => impl/Kokkos_DesulAtomicsConfig.hpp} (72%) delete mode 100644 
lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp delete mode 100644 lib/kokkos/core/unit_test/TestAggregate.hpp create mode 100644 lib/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp create mode 100644 lib/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp create mode 100644 lib/kokkos/core/unit_test/TestMDSpanConversion.hpp create mode 100644 lib/kokkos/core/unit_test/TestMultiGPU.hpp create mode 100644 lib/kokkos/core/unit_test/TestNestedReducerCTAD.cpp create mode 100644 lib/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp create mode 100644 lib/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp create mode 100644 lib/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp create mode 100644 lib/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp create mode 100644 lib/kokkos/core/unit_test/TestViewBadAlloc.hpp create mode 100644 lib/kokkos/core/unit_test/TestViewCopy_c.hpp delete mode 100644 lib/kokkos/core/unit_test/TestViewLayoutTiled.hpp create mode 100644 lib/kokkos/core/unit_test/TestViewOfViews.hpp create mode 100644 lib/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Graph.cpp create mode 100644 lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp create mode 100644 lib/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp create mode 100644 lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp rename lib/kokkos/{core/unit_test/hip/TestHIP_Graph.cpp => tpls/mdspan/include/experimental/__p2389_bits/dims.hpp} (59%) diff --git a/lib/kokkos/ b/lib/kokkos/ index 4fbc9002973..78225f9e6c2 100644 --- a/lib/kokkos/ +++ b/lib/kokkos/ @@ -1,12 +1,88 @@ # CHANGELOG +## [4.4.00]( +[Full Changelog]( + +### Features: +* Add `Kokkos::View` conversions from and to 
[`std::mdspan`]( [\#6830]( [\#7069]( + +### Backend and Architecture Enhancements: + +#### CUDA: +* `nvcc_wrapper`: Adding ability to process `--disable-warnings` flag [\#6936]( +* Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors [\#6891]( +* Improve compile-times when building with `Kokkos_ENABLE_DEBUG_BOUNDS_CHECK` in Cuda [\#7013]( + +#### HIP: +* Use HIP builtin atomics [\#6882]( [\#7000]( +* Enable user-specified compiler and linker flags for AMD GPUs [\#7127]( + +#### SYCL: +* Add support for Graphs [\#6912]( +* Fix multi-GPU support [\#6887]( +* Improve performance of reduction and scan operations [\#6562](, [\#6750]( +* Fix lock for guarding scratch space in `TeamPolicy` `parallel_reduce` [\#6988]( +* Include submission command queue property information into `SYCL::print_configuration()` [\#7004]( + +#### OpenACC: +* Make `TeamPolicy` `parallel_for` execute on the correct async queue [\#7012]( + +#### OpenMPTarget: +* Honor user requested loop ordering in `MDRange` policy [\#6925]( +* Prevent data races by guarding the scratch space used in `parallel_scan` [\#6998]( + +#### HPX: +* Workaround issue with template argument deduction to support compilation with NVCC [\#7015]( + +### General Enhancements +* Improve performance of view copies in host parallel regions [\#6730]( +* Harmonize convertibility rules of `Kokkos::RandomAccessIterator` with `View`s [\#6929]( +* Add a check precondition non-overlapping ranges for the `adjacent_difference` algorithm in debug mode [\#6922]( +* Add deduction guides for `TeamPolicy` [\#7030]( +* SIMD: Allow flexible vector width for 32 bit types [\#6802]( +* Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](, add `Kokkos::to_array` [\#6375](, make `Kokkos::Array` equality-comparable [\#7148]( +* Structured binding support for `Kokkos::complex` [\#7040]( + +### Build System Changes +* Do not require OpenMP support for languages other than CXX [\#6965]( +* Update 
Intel GPU architectures in Makefile [\#6895]( +* Fix use of OpenMP with Cuda or HIP as compile language [\#6972]( +* Define and enforce new minimum compiler versions for C++20 support [\#7128](, [\#7123]( +* Add nvidia Grace CPU architecture: `Kokkos_ARCH_ARMV9_GRACE` [\#7158]( +* Fix Makefile.kokkos for Threads [\#6896]( +* Remove support for NVHPC as CUDA device compiler [\#6987]( +* Fix using CUDAToolkit for CMake 3.28.4 and higher [\#7062]( + +### Incompatibilities (i.e. breaking changes) +* Drop `Kokkos::Array` special treatment in `View`s [\#6906]( +* Drop `Experimental::RawMemoryAllocationFailure` [\#7145]( + +### Deprecations +* Remove `Experimental::LayoutTiled` class template and deprecate `is_layouttiled` trait [\#6907]( +* Deprecate `Kokkos::layout_iterate_type_selector` [\#7076]( +* Deprecate specialization of `Kokkos::pair` for a single element [\#6947]( +* Deprecate `deep_copy` of `UnorderedMap` of different size [\#6812]( +* Deprecate trailing `Proxy` template argument of `Kokkos::Array` [\#6934]( +* Deprecate implicit conversions of integers to `ChunkSize` [\#7151]( +* Deprecate implicit conversions to execution spaces [\#7156]( + +### Bug Fixes +* Do not return a copy of the input functor in `Experimental::for_each` [\#6910]( +* Fix `realloc` on views of non-default constructible element types [\#6993]( +* Fix undefined behavior in `View` initialization or fill with zeros [\#7014]( +* Fix `sort_by_key` on host execution spaces when building with NVCC [\#7059]( +* Fix using shared libraries and -fvisibility=hidden [\#7065]( +* Fix view reference counting when functor copy constructor throws in parallel dispatch [\#6289]( +* Fix `initialize(InitializationSetting)` for handling `print_configuration` setting [\#7098]( +* Thread safety fixes for the Serial and OpenMP backend [\#7080](, [\#6151]( + ## [4.3.01]( [Full Changelog]( ### Backend and Architecture Enhancements: #### HIP: -* MI300 support unified memory support [\#6877]( +* MI300 support 
unified memory [\#6877]( ### Bug Fixes * Serial: Use the provided execution space instance in TeamPolicy [\#6951]( diff --git a/lib/kokkos/CITATION.cff b/lib/kokkos/CITATION.cff new file mode 100644 index 00000000000..28c674c451b --- /dev/null +++ b/lib/kokkos/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: +identifiers: + - type: url + website: +repository-code: '' +url: '' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. + family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. 
+ family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 76f2183db8a..054de2c1dae 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 3) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 6fdddd9a530..a8e1e803f45 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 3 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -21,11 +21,11 @@ KOKKOS_DEVICES ?= "OpenMP" # Options: # Intel: 
KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 -# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX +# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace # IBM: Power8,Power9 -# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 -# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC +# Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -41,7 +41,7 @@ KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. # Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async -KOKKOS_CUDA_OPTIONS ?= "enable_lambda" +KOKKOS_CUDA_OPTIONS ?= "enable_lambda,disable_malloc_async" # Options: rdc KOKKOS_HIP_OPTIONS ?= "" @@ -328,12 +328,43 @@ KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL) KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX) KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR) -KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +# Traditionally, we supported, e.g., IntelGen9 instead of Intel_Gen9. The latter +# matches the CMake option but we also accept the former for backward-compatibility. 
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen11) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen12LP) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP)) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_DG1) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_XeHP) +endif +# Traditionally the architecture was called PVC 
instead of Intel_PVC. This +# version makes us accept IntelPVC and Intel_PVC as well. KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC) # NVIDIA based. @@ -394,7 +425,8 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) -KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) +KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv9-Grace) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE) | bc)) # IBM based. KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) @@ -433,7 +465,6 @@ KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH), ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -758,6 +789,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV9_GRACE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") + + KOKKOS_CXXFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 + KOKKOS_LDFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") @@ -1119,11 +1158,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1103 -endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) @@ -1216,6 +1250,8 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0) endif tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_MDSPAN") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY") + KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index e6900a822a8..e8e429e0275 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -81,7 +81,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) diff --git a/lib/kokkos/ b/lib/kokkos/ index 19793bb82d9..c8c6f8f7cf5 100644 --- a/lib/kokkos/ +++ b/lib/kokkos/ @@ -1,4 +1,4 @@ -![Kokkos]( +[![Kokkos](]( # Kokkos: Core Libraries @@ -10,43 +10,66 @@ hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. -**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.** +**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](** -For the complete documentation, click below: +Kokkos is a [Linux Foundation]( project. -# []( - -# Learning about Kokkos +## Learning about Kokkos To start learning about Kokkos: -- [Kokkos Lectures]( they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities. +- [Kokkos Lectures]( they contain a mix of lecture videos and hands-on exercises covering all the important capabilities. -- [Programming guide]( contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. +- [Programming guide]( contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. 
-- [API reference]( organized by category, i.e., [core](, [algorithms]( and [containers]( or, if you prefer, in [alphabetical order]( +- [API reference]( organized by category, i.e., [core](, [algorithms]( and [containers]( or, if you prefer, in [alphabetical order]( -- [Use cases and Examples]( a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. +- [Use cases and Examples]( a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: or open a GitHub issue. +## Obtaining Kokkos -For non-public questions send an email to: *crtrott(at)* +The latest release of Kokkos can be obtained from the [GitHub releases page]( + +The current release is [4.3.01]( + +```bash +curl -OJ -L +# Or with wget +wget +``` + +To clone the latest development version of Kokkos from GitHub: + +```bash +git clone -b develop +``` -# Contributing to Kokkos +### Building Kokkos -Please see [this page]( for details on how to contribute. +To build Kokkos, you will need to have a C++ compiler that supports C++17 or later. +All requirements including minimum and primary tested compiler versions can be found [here]( -# Requirements, Building and Installing +Building and installation instructions are described [here]( + +You can also install Kokkos using [Spack]( `spack install kokkos`. [Available configuration options]( can be displayed using `spack info kokkos`. + +## For the complete documentation: []( + +## Support + +For questions find us on Slack: or open a GitHub issue. + +For non-public questions send an email to: *crtrott(at)* -All requirements including minimum and primary tested compiler versions can be found [here]( +## Contributing -Building and installation instructions are described [here]( +Please see [this page]( for details on how to contribute. 
-# Citing Kokkos +## Citing Kokkos -Please see the [following page]( +Please see the [following page]( -# License +## License [![License](]( diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index 36deccdfb1e..f11f8070484 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -189,6 +189,33 @@ void applyPermutation(const ExecutionSpace& space, KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); } +// FIXME_NVCC: nvcc has trouble compiling lambdas inside a function with +// variadic templates (sort_by_key_via_sort). Switch to using functors instead. +template +struct IotaFunctor { + Permute _permute; + KOKKOS_FUNCTION void operator()(int i) const { _permute(i) = i; } +}; +template +struct LessFunctor { + Keys _keys; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return _keys(i) < _keys(j); + } +}; + +// FIXME_NVCC+MSVC: We can't use a lambda instead of a functor which gave us +// "For this host platform/dialect, an extended lambda cannot be defined inside +// the 'if' or 'else' block of a constexpr if statement" +template +struct KeyComparisonFunctor { + Keys m_keys; + Comparator m_comparator; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return m_comparator(m_keys(i), m_keys(j)); + } +}; + template @@ -207,10 +234,9 @@ void sort_by_key_via_sort( n); // iota - Kokkos::parallel_for( - "Kokkos::sort_by_key_via_sort::iota", - Kokkos::RangePolicy(exec, 0, n), - KOKKOS_LAMBDA(int i) { permute(i) = i; }); + Kokkos::parallel_for("Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + IotaFunctor{permute}); using Layout = typename Kokkos::View::array_layout; @@ -228,16 +254,15 @@ void sort_by_key_via_sort( Kokkos::DefaultHostExecutionSpace host_exec; if constexpr (sizeof...(MaybeComparator) == 0) { - Kokkos::sort( - host_exec, 
host_permute, - KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + Kokkos::sort(host_exec, host_permute, + LessFunctor{host_keys}); } else { auto keys_comparator = std::get<0>(std::tuple(maybeComparator...)); Kokkos::sort( - host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { - return keys_comparator(host_keys(i), host_keys(j)); - }); + host_exec, host_permute, + KeyComparisonFunctor{ + host_keys, keys_comparator}); } host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); Kokkos::deep_copy(exec, permute, host_permute); @@ -262,16 +287,14 @@ void sort_by_key_via_sort( } #else if constexpr (sizeof...(MaybeComparator) == 0) { - Kokkos::sort( - exec, permute, - KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + Kokkos::sort(exec, permute, LessFunctor{keys}); } else { auto keys_comparator = std::get<0>(std::tuple(maybeComparator...)); Kokkos::sort( - exec, permute, KOKKOS_LAMBDA(int i, int j) { - return keys_comparator(keys(i), keys(j)); - }); + exec, permute, + KeyComparisonFunctor{ + keys, keys_comparator}); } #endif } diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp index 6215b325afc..05969be463a 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp @@ -29,49 +29,46 @@ namespace Experimental { template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_exespace_impl(label, ex, first, last, - std::move(functor)); +void for_each(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, UnaryFunctorType functor) { + Impl::for_each_exespace_impl(label, ex, first, last, std::move(functor)); } template < 
class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { - return Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", - ex, first, last, std::move(functor)); +void for_each(const ExecutionSpace& ex, IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", ex, + first, last, std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const std::string& label, const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), + std::move(functor)); } template < class ExecutionSpace, class DataType, class... 
Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, - KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, + KE::begin(v), KE::end(v), std::move(functor)); } // @@ -82,24 +79,23 @@ UnaryFunctorType for_each(const ExecutionSpace& ex, template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType for_each(const TeamHandleType& teamHandle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); } template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType -for_each(const TeamHandleType& teamHandle, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), + std::move(functor)); } } // namespace Experimental diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index a8171fa068d..9f7fcf94fe0 100644 --- 
a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -82,6 +82,11 @@ OutputIteratorType adjacent_difference_exespace_impl( return first_dest; } +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif + // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); @@ -114,6 +119,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( return first_dest; } +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif + // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 27ce5a6fad6..54bb13e25b9 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -24,18 +24,21 @@ namespace Kokkos { namespace Experimental { namespace Impl { +template +class RandomAccessIterator; + template struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< - T, std::enable_if_t< ::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)> > + T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value)>> : std::true_type {}; template @@ -58,6 +61,18 @@ using is_iterator = Kokkos::is_detected; template inline constexpr bool is_iterator_v = is_iterator::value; +template +struct is_kokkos_iterator : std::false_type {}; + +template +struct is_kokkos_iterator> { + static constexpr bool value = + 
is_admissible_to_kokkos_std_algorithms::value; +}; + +template +inline constexpr bool is_kokkos_iterator_v = is_kokkos_iterator::value; + // // are_iterators // @@ -215,6 +230,38 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, (void)last; } +// +// Check if kokkos iterators are overlapping +// +template <typename IteratorType1, typename IteratorType2> +KOKKOS_INLINE_FUNCTION void expect_no_overlap( + [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, + [[maybe_unused]] IteratorType2 s_first) { + if constexpr (is_kokkos_iterator_v<IteratorType1> && + is_kokkos_iterator_v<IteratorType2>) { + auto const view1 = first.view(); + auto const view2 = s_first.view(); + + std::size_t stride1 = view1.stride(0); + std::size_t stride2 = view2.stride(0); + ptrdiff_t first_diff = view1.data() - view2.data(); + + // FIXME If strides are not identical, checks may not be made + // with the cost of O(1) + // Currently, checks are made only if strides are identical + // If first_diff == 0, there is already an overlap + if (stride1 == stride2 || first_diff == 0) { + [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); + auto* first_pointer1 = view1.data(); + auto* first_pointer2 = view2.data(); + [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first); + [[maybe_unused]] auto* last_pointer2 = first_pointer2 + (last - first); + KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 || + last_pointer1 <= first_pointer2 || is_no_overlap); + } + } +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index 3c1e2474bc9..ad7b8bb8cab 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -150,8 +150,9 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( return d_first + count; } -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined
KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp index d3be3b7f667..99cc4a1cf3a 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp @@ -42,10 +42,9 @@ struct StdForEachFunctor { }; template -UnaryFunctorType for_each_exespace_impl(const std::string& label, - const HandleType& handle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { +void for_each_exespace_impl(const std::string& label, const HandleType& handle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(handle, first); Impl::expect_valid_range(first, last); @@ -56,8 +55,6 @@ UnaryFunctorType for_each_exespace_impl(const std::string& label, label, RangePolicy(handle, 0, num_elements), StdForEachFunctor(first, functor)); handle.fence("Kokkos::for_each: fence after operation"); - - return functor; } template -KOKKOS_FUNCTION UnaryFunctorType -for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, - IteratorType last, UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); Impl::expect_valid_range(first, last); @@ -96,7 +93,6 @@ for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, TeamThreadRange(teamHandle, 0, num_elements), StdForEachFunctor(first, functor)); teamHandle.team_barrier(); - return functor; } template > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} 
+#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond + template + requires(std::is_constructible_v) KOKKOS_FUNCTION + explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#else + template < + class OtherViewType, + std::enable_if_t && + !std::is_convertible_v, + int> = 0> + KOKKOS_FUNCTION explicit RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} + + template , + int> = 0> + KOKKOS_FUNCTION RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#endif + KOKKOS_FUNCTION iterator_type& operator++() { ++m_current_index; @@ -152,9 +176,16 @@ class RandomAccessIterator< ::Kokkos::View > { KOKKOS_FUNCTION reference operator*() const { return m_view(m_current_index); } + KOKKOS_FUNCTION + view_type view() const { return m_view; } + private: view_type m_view; ptrdiff_t m_current_index = 0; + + // Needed for the converting constructor accepting another iterator + template + friend class RandomAccessIterator; }; } // namespace Impl diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index c7c29302786..710d04805d8 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -175,8 +175,9 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( d_first + count); } -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp 
b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 282d85548c5..7d484136b6d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) { EXPECT_TRUE(true); } +TEST_F(random_access_iterator_test, constructiblity) { + auto first_d = KE::begin(m_dynamic_view); + auto cfirst_d = KE::cbegin(m_dynamic_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d); + + auto first_s = KE::begin(m_static_view); + auto cfirst_s = KE::cbegin(m_static_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s); + + auto first_st = KE::begin(m_strided_view); + auto cfirst_st = KE::cbegin(m_strided_view); + + static_assert( + std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_st) tmp_cfirst_st(first_st); + + // [FIXME] Better to have tests for the explicit specifier with an expression. + // As soon as View converting constructors are re-implemented with a + // conditional explicit, we may add those tests. 
+ static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + EXPECT_TRUE(true); +} + template void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View; diff --git a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 16f68eaaf26..9e5bd4a5748 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -69,7 +69,7 @@ void iota(ExecutionSpace const &space, ViewType const &v, typename ViewType::value_type value = 0) { using ValueType = typename ViewType::value_type; Kokkos::parallel_for( - "ArborX::Algorithms::iota", + "Kokkos::Algorithms::iota", Kokkos::RangePolicy(space, 0, v.extent(0)), KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); } @@ -87,6 +87,18 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); } +// Test #7036 +TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { + using ExecutionSpace = Kokkos::DefaultHostExecutionSpace; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + TEST(TEST_CATEGORY, SortByKey) { using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 386d533f7a8..2a4525a8c33 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,5 +81,114 @@ TEST(std_algorithms, 
is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } +TEST(std_algorithms, expect_no_overlap) { + namespace KE = Kokkos::Experimental; + using value_type = double; + + static constexpr size_t extent0 = 13; + + //------------- + // 1d views + //------------- + using static_view_1d_t = Kokkos::View; + [[maybe_unused]] static_view_1d_t static_view_1d{ + "std-algo-test-1d-contiguous-view-static"}; + + using dyn_view_1d_t = Kokkos::View; + [[maybe_unused]] dyn_view_1d_t dynamic_view_1d{ + "std-algo-test-1d-contiguous-view-dynamic", extent0}; + + using strided_view_1d_t = Kokkos::View; + Kokkos::LayoutStride layout1d{extent0, 2}; + strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; + +// Overlapping because iterators are identical +#if defined(KOKKOS_ENABLE_DEBUG) + auto first_s = KE::begin(static_view_1d); + auto last_s = first_s + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, + "Kokkos contract violation:.*"); + + auto first_d = KE::begin(dynamic_view_1d); + auto last_d = first_d + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d, last_d, first_d); }, + "Kokkos contract violation:.*"); + + auto first_st = KE::begin(strided_view_1d); + auto last_st = first_st + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_st, last_st, first_st); }, + "Kokkos contract violation:.*"); +#endif + + // Ranges are overlapped + static constexpr size_t sub_extent0 = 6, offset0 = 3; + std::pair range0(0, sub_extent0), + range1(offset0, offset0 + sub_extent0); +#if defined(KOKKOS_ENABLE_DEBUG) + auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); + auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); + auto first_s0 = KE::begin(static_view_1d_0); // [0, 6) + auto last_s0 = first_s0 + static_view_1d_0.extent(0); + auto first_s1 = KE::begin(static_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1); }, + "Kokkos contract 
violation:.*"); + + auto dynamic_view_1d_0 = Kokkos::subview(dynamic_view_1d, range0); + auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); + auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6) + auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); + auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1); }, + "Kokkos contract violation:.*"); +#endif + + auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); + auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); + auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12) + auto last_st0 = first_st0 + strided_view_1d_0.extent(0); + auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) + // Does not overlap since offset (=3) is not divisible by stride (=2) + EXPECT_NO_THROW( + { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + + // Iterating over the same range without overlapping + Kokkos::View static_view_2d{ + "std-algo-test-2d-contiguous-view-static"}; + auto sub_static_view_1d_0 = Kokkos::subview(static_view_2d, 0, Kokkos::ALL); + auto sub_static_view_1d_1 = Kokkos::subview(static_view_2d, 1, Kokkos::ALL); + auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... + auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); + auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... + + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); + }); + + Kokkos::View dynamic_view_2d{ + "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; + auto sub_dynamic_view_1d_0 = Kokkos::subview(dynamic_view_2d, 0, Kokkos::ALL); + auto sub_dynamic_view_1d_1 = Kokkos::subview(dynamic_view_2d, 1, Kokkos::ALL); + auto sub_first_d0 = KE::begin(sub_dynamic_view_1d_0); // 0, 2, 4, ... + auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); + auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... 
+ + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); + }); + + Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; + Kokkos::View strided_view_2d{ + "std-algo-test-2d-contiguous-view-strided", layout2d}; + auto sub_strided_view_1d_0 = Kokkos::subview(strided_view_2d, 0, Kokkos::ALL); + auto sub_strided_view_1d_1 = Kokkos::subview(strided_view_2d, 1, Kokkos::ALL); + auto sub_first_st0 = KE::begin(sub_strided_view_1d_0); // 0, 6, 12, ... + auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); + auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... + + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); + }); +} + } // namespace stdalgos } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 2c8fee02f47..7cb9851087a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -85,7 +85,7 @@ struct TestFunctorA { break; } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET case 2: { auto it = KE::exclusive_scan( @@ -213,7 +213,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { break; } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET case 2: case 3: { auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom), @@ -242,7 +242,7 @@ template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index f9adeb0654b..850e80dde1e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -52,7 +52,7 @@ struct TestFunctorA { Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -179,7 +179,7 @@ template void run_all_scenarios(bool makeDataSortedOnPurpose) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5153}) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index 33af5f99def..e3b95527c77 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -73,7 +73,7 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), @@ -226,7 +226,7 @@ template void run_all_scenarios(const std::string& name, const std::vector& cols) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : cols) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { diff --git 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index fb891a8780f..283525dbd10 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -59,7 +59,7 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = @@ -170,7 +170,7 @@ void run_all_scenarios() { } TEST(std_algorithms_max_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 4ba1b6f968b..8579b48315d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -59,7 +59,7 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = @@ -169,7 +169,7 @@ void run_all_scenarios() { } TEST(std_algorithms_min_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 17562a55727..51010fdff59 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -66,7 +66,7 @@ struct TestFunctorA { m_distancesView(myRowIndex, 1) = resultDist2; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto itPair = @@ -188,7 +188,7 @@ void run_all_scenarios() { } TEST(std_algorithms_minmax_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp index 94c2a8f1f9a..eb00d9e083a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp @@ -16,7 +16,7 @@ #include -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 60fa369af18..1c438543819 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -16,7 +16,7 @@ #include -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 10454d65515..0b0d798fd80 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -16,7 +16,7 @@ 
#include -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp index b0a3241ec4b..17ded226aae 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp @@ -16,7 +16,7 @@ #include -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml index c0b6e9cab9f..d0a5645ef7b 100644 --- a/lib/kokkos/appveyor.yml +++ b/lib/kokkos/appveyor.yml @@ -5,6 +5,6 @@ build_script: - cmd: >- mkdir build && cd build && - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && cmake --build . --target install && ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index abf50283594..529ef393d99 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -4,7 +4,7 @@ KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) - +KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. 
IF(NOT Kokkos_ENABLE_OPENMPTARGET) KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt new file mode 100644 index 00000000000..50a331b2b35 --- /dev/null +++ b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + view_copy_constructor + SOURCES view_copy_constructor.cpp +) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/Makefile b/lib/kokkos/benchmarks/view_copy_constructor/Makefile new file mode 100644 index 00000000000..70c6d517e0d --- /dev/null +++ b/lib/kokkos/benchmarks/view_copy_constructor/Makefile @@ -0,0 +1,46 @@ +KOKKOS_DEVICES=Serial +KOKKOS_ARCH = "" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +CXX = clang++ +EXE = view_copy_constructor.exe + +CXXFLAGS ?= -Ofast +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -Ofast +KOKKOS_CXX_STANDARD=c++20 + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o view_copy_constructor.cuda view_copy_constructor.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp b/lib/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp new file mode 100644 index 00000000000..63c49f09c01 --- /dev/null +++ 
b/lib/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp @@ -0,0 +1,310 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// The function "test_view_collection" exposes the copy constructor +// and destructor overheads in Kokkos View objects +// Please see the lines marked by "NOTE". + +#include +#include +#include +#include +#include +#include +#include + +// NVIEWS is the number of Kokkos View objects in our ViewCollection object +// We have chosen a large value of 40 to make it easier to see performance +// differences when using the likelihood attribute +#define NVIEWS 40 + +class ViewCollection { + public: + Kokkos::View v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40; + double m_expected_sum; + double m_side_effect; + int m_N; + + ViewCollection(int N) + : v1("v1", N), + v2("v2", N), + v3("v3", N), + v4("v4", N), + v5("v5", N), + v6("v6", N), + v7("v7", N), + v8("v8", N), + v9("v9", N), + v10("v10", N), + v11("v11", N), + v12("v12", N), + v13("v13", N), + v14("v14", N), + v15("v15", N), + v16("v16", N), + v17("v17", N), + v18("v18", N), + v19("v19", N), + v20("v20", N), + v21("v21", N), + v22("v22", N), + v23("v23", N), + v24("v24", N), + v25("v25", N), + v26("v26", N), + v27("v27", N), + v28("v28", N), + v29("v29", N), + v30("v30", N), + v31("v31", N), + v32("v32", N), + v33("v33", N), + v34("v34", N), + v35("v35", N), + v36("v36", N), + 
v37("v37", N), + v38("v38", N), + v39("v39", N), + v40("v40", N), + m_expected_sum(N * NVIEWS), + m_side_effect(0.0), + m_N(N) { + for (int i = 0; i < N; ++i) { + v1(i) = 1; + v2(i) = 1; + v3(i) = 1; + v4(i) = 1; + v5(i) = 1; + v6(i) = 1; + v7(i) = 1; + v8(i) = 1; + v9(i) = 1; + v10(i) = 1; + v11(i) = 1; + v12(i) = 1; + v13(i) = 1; + v14(i) = 1; + v15(i) = 1; + v16(i) = 1; + v17(i) = 1; + v18(i) = 1; + v19(i) = 1; + v20(i) = 1; + v21(i) = 1; + v22(i) = 1; + v23(i) = 1; + v24(i) = 1; + v25(i) = 1; + v26(i) = 1; + v27(i) = 1; + v28(i) = 1; + v29(i) = 1; + v30(i) = 1; + v31(i) = 1; + v32(i) = 1; + v33(i) = 1; + v34(i) = 1; + v35(i) = 1; + v36(i) = 1; + v37(i) = 1; + v38(i) = 1; + v39(i) = 1; + v40(i) = 1; + } + } + +// The ADD_COPY_CONSTRUCTOR macro is helpful to compare time in the copy +// constructor between compilers. We have found that the GNU compiler +// is sometimes able to inline the default copy constructor. +#ifdef ADD_COPY_CONSTRUCTOR + __attribute__((noinline)) ViewCollection(const ViewCollection& other) + : v1(other.v1), + v2(other.v2), + v3(other.v3), + v4(other.v4), + v5(other.v5), + v6(other.v6), + v7(other.v7), + v8(other.v8), + v9(other.v9), + v10(other.v10), + v11(other.v11), + v12(other.v12), + v13(other.v13), + v14(other.v14), + v15(other.v15), + v16(other.v16), + v17(other.v17), + v18(other.v18), + v19(other.v19), + v20(other.v20), + v21(other.v21), + v22(other.v22), + v23(other.v23), + v24(other.v24), + v25(other.v25), + v26(other.v26), + v27(other.v27), + v28(other.v28), + v29(other.v29), + v30(other.v30), + v31(other.v31), + v32(other.v32), + v33(other.v33), + v34(other.v34), + v35(other.v35), + v36(other.v36), + v37(other.v37), + v38(other.v38), + v39(other.v39), + v40(other.v40), + m_expected_sum(other.m_expected_sum), + m_side_effect(other.m_side_effect), + m_N(other.m_N) {} +#endif + + KOKKOS_INLINE_FUNCTION + double sum_views(int ii, bool execute_kernel) { + double result = 0.0; + if (execute_kernel) { + // This code is only executed 
when using the command line option -k + // The computation references all Kokkos views. This may help our + // effort to stop compilers from optimizing away the Kokkos views + for (int i = 0; i < m_N; ++i) { + result += v1(i) + v2(i) + v3(i) + v4(i) + v5(i) + v6(i) + v7(i) + + v8(i) + v9(i) + v10(i) + v11(i) + v12(i) + v13(i) + v14(i) + + v15(i) + v16(i) + v17(i) + v18(i) + v19(i) + v20(i) + v21(i) + + v22(i) + v23(i) + v24(i) + v25(i) + v26(i) + v27(i) + v28(i) + + v29(i) + v30(i) + v31(i) + v32(i) + v33(i) + v34(i) + v35(i) + + v36(i) + v37(i) + v38(i) + v39(i) + v40(i); + } + } else { + result = m_expected_sum; + } + // This statement introduces a side effect that may help our effort to + // stop compilers from optimizing away the temporary ViewCollection object + m_side_effect = result * (ii + 1); + return result; + } +}; + +void test_view_collection_kk(int N, int num_iter, bool execute_kernel) { + ViewCollection view_collection(N); + + Kokkos::Timer view_collection_timer; + double max_value = 0.0; + // Max Reduction boilerplate code taken from slide 53 of + // kokkos-tutorials/LectureSeries/KokkosTutorial_02_ViewsAndSpaces.pdf + Kokkos::parallel_reduce( + "collection-reduction", num_iter, + KOKKOS_LAMBDA(int i, double& valueToUpdate) { + // NOTE: The following lines expose the Kokkos View overheads + ViewCollection tmp_view_collection = view_collection; + double my_value = tmp_view_collection.sum_views(i, execute_kernel); + if (my_value > valueToUpdate) valueToUpdate = my_value; + }, + Kokkos::Max(max_value)); + double view_collection_time = view_collection_timer.seconds(); + + bool success = std::fabs(max_value - N * NVIEWS) < 1.E-6; + std::cout << "View Time = " << view_collection_time << " seconds" + << std::endl; + if (success) { + std::cout << "Kokkos run:" << std::endl; + std::cout << "SUCCESS" << std::endl; + } else { + std::cout << "FAILURE" << std::endl; + } +} + +void test_view_collection_serial(int N, int num_iter, bool execute_kernel) { + 
ViewCollection view_collection(N); + + Kokkos::Timer view_collection_timer; + double max_value = 0.0; + // Max Reduction boilerplate code taken from slide 53 of + // kokkos-tutorials/LectureSeries/KokkosTutorial_02_ViewsAndSpaces.pdf + for (int i = 0; i < num_iter; ++i) { + // NOTE: The following lines expose the Kokkos View overheads + ViewCollection tmp_view_collection = view_collection; + double my_value = tmp_view_collection.sum_views(i, execute_kernel); + if (my_value > max_value) max_value = my_value; + } + double view_collection_time = view_collection_timer.seconds(); + + bool success = std::fabs(max_value - N * NVIEWS) < 1.E-6; + std::cout << "View Time 2 = " << view_collection_time << " seconds" + << std::endl; + if (success) { + std::cout << "Serial run:" << std::endl; + std::cout << "SUCCESS" << std::endl; + } else { + std::cout << "FAILURE" << std::endl; + } +} + +int main(int argc, char* argv[]) { + // The benchmark is only testing reference counting for views on host. +#if defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_SERIAL) || \ + defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_HPX) + int N = 1; + int num_iter = 1 << 27; + bool execute_kernel = false; + + for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "-N") == 0)) { + N = (i + 1 < argc) ? atoi(argv[++i]) : 0; // no value given: 0 is rejected below + if (N < 1) { + std::cout << "Array extent must be >= 1" << std::endl; + exit(1); + } + } else if (strcmp(argv[i], "-i") == 0) { + num_iter = (i + 1 < argc) ? atoi(argv[++i]) : 0; // no value given: 0 is rejected below + if (num_iter < 1) { + std::cout << "Number of iterations must be >= 1" << std::endl; + exit(1); + } + } else if (strcmp(argv[i], "-k") == 0) { + execute_kernel = true; + } else if ((strcmp(argv[i], "-h") == 0)) { + printf(" Options:\n"); + printf(" -N : Array extent\n"); + printf(" -i : Number of iterations\n"); + printf(" -k: Execute the summation kernel\n"); + printf(" -h: Print this message\n\n"); + exit(1); + } + } + + std::cout << "Array extent = " << N << std::endl; + std::cout << "Iterations = " << num_iter << std::endl; +
std::cout << "Execute summation kernel = " << std::boolalpha << execute_kernel + << std::noboolalpha << std::endl; + + // Test inside a Kokkos kernel. + Kokkos::initialize(argc, argv); + { test_view_collection_kk(N, num_iter, execute_kernel); } + + // Test outside Kokkos kernel. + test_view_collection_serial(N, num_iter, execute_kernel); + + Kokkos::finalize(); +#endif + + return 0; +} diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index dbfef2267fe..d58645f98ad 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -233,7 +233,7 @@ do cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) + --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler|--disable-warnings) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument diff --git a/lib/kokkos/cmake/Dependencies.cmake b/lib/kokkos/cmake/Dependencies.cmake index 611c089b2e3..fb1e73b5799 100644 --- a/lib/kokkos/cmake/Dependencies.cmake +++ b/lib/kokkos/cmake/Dependencies.cmake @@ -1,6 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE ) TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/lib/kokkos/cmake/ b/lib/kokkos/cmake/ index 8d5ef0de42f..d3ac39ffa31 100644 --- a/lib/kokkos/cmake/ +++ b/lib/kokkos/cmake/ @@ -225,8 +225,13 @@ FUNCTION(kokkos_compilation) # if built w/o CUDA support, we want to basically make this a no-op SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) + + IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.17) + SET(MAYBE_CURRENT_INSTALLATION_ROOT "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../../..") + ENDIF() + # search relative first and then absolute - SET(_HINTS
"${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + SET(_HINTS "${MAYBE_CURRENT_INSTALLATION_ROOT}" "@CMAKE_INSTALL_PREFIX@") # find kokkos_launch_compiler FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER diff --git a/lib/kokkos/cmake/ b/lib/kokkos/cmake/ index 94f8fc4214f..7997aa3707c 100644 --- a/lib/kokkos/cmake/ +++ b/lib/kokkos/cmake/ @@ -52,6 +52,8 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY +#cmakedefine KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND #cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ @@ -65,6 +67,7 @@ #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 +#cmakedefine KOKKOS_ARCH_ARMV9_GRACE #cmakedefine KOKKOS_ARCH_A64FX #cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX2 @@ -116,7 +119,6 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX942 #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 -#cmakedefine KOKKOS_ARCH_AMD_GFX1103 #cmakedefine KOKKOS_ARCH_AMD_GPU #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 5a62c530fce..445f4e93a59 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -7,37 +7,38 @@ IF (NOT CUDAToolkit_ROOT) ENDIF() ENDIF() -# FIXME CMake 3.28.4 creates more targets than we export -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4") - find_package(CUDAToolkit) -ELSE() - include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) -ENDIF() - - -IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) -ELSE() - SET(FOUND_CUDART FALSE) +IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL 
NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") ENDIF() -IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) -ELSE() - SET(FOUND_CUDA_DRIVER FALSE) -ENDIF() - -include(FindPackageHandleStandardArgs) -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ELSE() - SET(KOKKOS_CUDA_ERROR DEFAULT_MSG) -ENDIF() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER) -IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") + find_package(CUDAToolkit REQUIRED) KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart ) + KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) +ELSE() + include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) + + IF (TARGET CUDA::cudart) + SET(FOUND_CUDART TRUE) + KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) + ELSE() + SET(FOUND_CUDART FALSE) + ENDIF() + + IF (TARGET CUDA::cuda_driver) + SET(FOUND_CUDA_DRIVER TRUE) + KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) + ELSE() + SET(FOUND_CUDA_DRIVER FALSE) + ENDIF() + + include(FindPackageHandleStandardArgs) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) + KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE + LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart + ) + ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 68bf5b3d579..5b6afd61512 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -35,7 +35,6 @@ IF(NOT _CUDA_FAILURE) GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) - 
KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) ELSE() SET(TPL_ENABLE_CUDA OFF) ENDIF() diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab91..00000000000 --- a/lib/kokkos/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index 4c5331ec793..a18d2ac518a 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -118,14 +118,6 @@ FUNCTION(KOKKOS_ADD_TEST) ENDIF() ENDFUNCTION() -FUNCTION(KOKKOS_ADD_ADVANCED_TEST) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_ADVANCED_TEST(${ARGN}) - else() - # TODO Write this - endif() -ENDFUNCTION() - MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index df11c76cc3c..a581d9f9457 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -28,6 +28,7 @@ 
KOKKOS_CHECK_DEPRECATED_OPTIONS( #------------------------------------------------------------------------------- SET(KOKKOS_ARCH_LIST) +include(CheckCXXCompilerFlag) KOKKOS_DEPRECATED_LIST(ARCH ARCH) @@ -49,6 +50,7 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") +DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -101,9 +103,9 @@ LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) +LIST(APPEND SUPPORTED_AMD_GPUS RX7900XTX RX7900XTX V620/W6800 V620/W6800) +LIST(APPEND SUPPORTED_AMD_ARCHS NAVI1100 AMD_GFX1100 NAVI1030 AMD_GFX1030) +LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1100 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) @@ -189,12 +191,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
ENDIF() -ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - SET(CUDA_ARCH_FLAG "-gpu") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) - IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) - ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) SET(CUDA_ARCH_FLAG "-arch") ENDIF() @@ -209,6 +205,11 @@ ENDIF() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- +KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) +MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) + #clear anything that might be in the cache GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) IF(KOKKOS_ENABLE_HIP) @@ -301,6 +302,20 @@ IF (KOKKOS_ARCH_A64FX) ) ENDIF() +IF (KOKKOS_ARCH_ARMV9_GRACE) + SET(KOKKOS_ARCH_ARM_NEON ON) + check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) + check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) + IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 + ) + ELSE() + MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") + ENDIF() +ENDIF() + IF (KOKKOS_ARCH_ZEN) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID @@ -535,17 +550,17 @@ IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) SET(KOKKOS_ARCH_AVX512XEON OFF) ENDIF() +# FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
+IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + UNSET(KOKKOS_ARCH_ARM_NEON) +ENDIF() + IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( Clang -fcuda-rdc NVIDIA --relocatable-device-code=true - NVHPC -gpu=rdc ) - ELSEIF(KOKKOS_ENABLE_CUDA) - COMPILER_SPECIFIC_FLAGS( - NVHPC -gpu=nordc - ) ENDIF() ENDIF() @@ -571,7 +586,7 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) COMPILER_SPECIFIC_LINK_OPTIONS( DEFAULT --hip-link ) @@ -654,15 +669,9 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) ELSE() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${FLAG}) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}") - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") ENDIF() ENDIF() ENDIF() @@ -704,14 +713,16 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() + IF(KOKKOS_ENABLE_HIP) + SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + ENDIF() + IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + ENDIF() + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + ENDIF() ENDIF() ENDIF() ENDFUNCTION() @@ -724,6 +735,15 @@ FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) ENDFOREACH() +IF(KOKKOS_IMPL_AMDGPU_FLAGS) + IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodetection is disabled. 
" + "Please explicitly set the GPU architecture.") + ENDIF() + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +ENDIF() + MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) KOKKOS_SET_OPTION(ARCH_${ARCH} ON) CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) @@ -984,7 +1004,7 @@ IF (KOKKOS_ARCH_HOPPER90) ENDIF() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) +IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) IF(NOT ROCM_ENUMERATOR) MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 9135ca2b41c..e8bfadb64eb 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -42,12 +42,8 @@ IF(Kokkos_ENABLE_CUDA) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) IF(CMAKE_CXX_COMPILER_LAUNCHER) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - MESSAGE(STATUS "Using nvc++ as device compiler requires Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON!") - ENDIF() MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. 
nvcc_wrapper or clang++!") ENDIF() # the first argument to launcher is always the C++ compiler defined by cmake @@ -149,56 +145,85 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) ENDIF() # Enforce the minimum compilers supported by Kokkos. -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) 2023.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 11.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 5.2.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI 22.3 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC 19.29 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +IF(NOT CMAKE_CXX_STANDARD) + SET(CMAKE_CXX_STANDARD 17) +ENDIF() +IF(CMAKE_CXX_STANDARD EQUAL 17) + SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + SET(KOKKOS_GCC_MINIMUM 8.2.0) + SET(KOKKOS_INTEL_MINIMUM 19.0.5) + SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + SET(KOKKOS_NVCC_MINIMUM 11.0.0) + SET(KOKKOS_HIPCC_MINIMUM 5.2.0) + SET(KOKKOS_NVHPC_MINIMUM 22.3) + SET(KOKKOS_MSVC_MINIMUM 19.29) +ELSE() + SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + SET(KOKKOS_GCC_MINIMUM 10.1.0) + SET(KOKKOS_INTEL_MINIMUM 
"not supported") + SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + SET(KOKKOS_NVCC_MINIMUM 12.0.0) + SET(KOKKOS_HIPCC_MINIMUM 5.2.0) + SET(KOKKOS_NVHPC_MINIMUM 22.3) + SET(KOKKOS_MSVC_MINIMUM 19.30) +ENDIF() + +SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 8.0.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 10.0.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 
${KOKKOS_CLANG_CUDA_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 8.2.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 19.0.5) + IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.1.1) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2023.0.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.0.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 5.2.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 22.3) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() # Treat PGI internally as NVHPC to simplify handling both compilers. 
@@ -206,13 +231,13 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NV # backward-compatible to pgc++. SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 19.29) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 32788e7aa0f..b900c4a232e 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -75,8 +75,12 @@ KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified me # This option will go away eventually, but allows fallback to old implementation when needed. 
KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") +KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) +KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") +KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 9dab1ca00ea..d1f1e0d7a78 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -709,7 +709,12 @@ MACRO(kokkos_find_imported NAME) ENDIF() IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) + SET(TPL_LIBRARY_SUFFIXES lib) + IF(KOKKOS_IMPL_32BIT) + LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) + ELSE() + LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) + ENDIF() ENDIF() SET(${NAME}_INCLUDE_DIRS) diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index b075a3e36b5..5b45674e057 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -124,12 +124,8 @@ 
IF(KOKKOS_ENABLE_CUDA) ELSEIF(CMAKE_CXX_EXTENSIONS) MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. To allow nvc++ as Cuda compiler, Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON must be set!") - ELSE() - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() + ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index 6ef3b79bde2..cda9e0d6004 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -103,13 +103,19 @@ if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) endif() IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) + find_package(OpenMP REQUIRED COMPONENTS CXX) # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency # so we just append the flags here instead of linking with the OpenMP target. 
IF(KOKKOS_HAS_TRILINOS) COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) + KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) + ENDIF() + IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + ENDIF() + IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 060a7a8472c..6da543a2c85 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -160,6 +160,12 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) ) ENDIF() ENDIF() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. + IF(BUILD_SHARED_LIBS) + SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) @@ -241,34 +247,6 @@ MACRO(KOKKOS_CONFIGURE_CORE) KOKKOS_CONFIG_HEADER( KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") - KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") - KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") - KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space") - KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override 
default host parallel execution space") - IF (NOT Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE STREQUAL "") - SET(_DEVICE_PARALLEL ${Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE}) - MESSAGE(STATUS "Override default device execution space: ${_DEVICE_PARALLEL}") - SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) - ELSE() - IF (_DEVICE_PARALLEL STREQUAL "NoTypeDefined") - SET(KOKKOS_DEVICE_SPACE_ACTIVE OFF) - ELSE() - SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) - ENDIF() - ENDIF() - IF (NOT Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE STREQUAL "") - SET(_HOST_PARALLEL ${Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE}) - MESSAGE(STATUS "Override default host parallel execution space: ${_HOST_PARALLEL}") - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) - ELSE() - IF (_HOST_PARALLEL STREQUAL "NoTypeDefined") - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE OFF) - ELSE() - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) - ENDIF() - ENDIF() - #We are ready to configure the header CONFIGURE_FILE(cmake/ KokkosCore_config.h @ONLY) ENDMACRO() @@ -484,15 +462,10 @@ ENDFUNCTION() FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - IF(KOKKOS_HAS_TRILINOS) - #ignore the target, tribits doesn't do anything directly with targets - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) - ELSE() #append to a list for later - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() - ENDIF() + KOKKOS_LIB_TYPE(${TARGET} INCTYPE) + FOREACH(DIR ${ARGN}) + TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) + ENDFOREACH() ENDFUNCTION() FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) diff --git a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b1..00000000000 --- a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 
4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index e821570a8d5..a37a2bdcebd 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -944,13 +944,13 @@ class DualView : public ViewTraits { if (sizeMismatch) { ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, typename t_host::memory_space(), d_view); } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -1038,12 +1038,10 @@ class DualView : public ViewTraits { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); - } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); - } + // this part of the lambda was relocated in a method as it 
contains a + // `if constexpr`. In some cases, both branches were evaluated + // leading to a compile error + resync_host(properties); /* Mark Device copy as modified */ ++modified_flags(1); @@ -1054,13 +1052,10 @@ class DualView : public ViewTraits { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { - d_view = create_mirror_view(typename t_dev::memory_space(), h_view); - - } else { - d_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_dev::memory_space(), h_view); - } + // this part of the lambda was relocated in a method as it contains a + // `if constexpr`. In some cases, both branches were evaluated + // leading to a compile error + resync_device(properties); /* Mark Host copy as modified */ ++modified_flags(0); @@ -1099,6 +1094,39 @@ class DualView : public ViewTraits { } } + private: + // resync host mirror from device + // this code was relocated from a lambda as it contains a `if constexpr`. 
+ // In some cases, both branches were evaluated, leading to a compile error + template + inline void resync_host(Impl::ViewCtorProp const&) { + using alloc_prop_input = Impl::ViewCtorProp; + + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } + } + + // resync device mirror from host + // this code was relocated from a lambda as it contains a `if constexpr` + // In some cases, both branches were evaluated leading to a compile error + template + inline void resync_device(Impl::ViewCtorProp const&) { + using alloc_prop_input = Impl::ViewCtorProp; + + if constexpr (alloc_prop_input::initialize) { + d_view = create_mirror_view(typename t_dev::memory_space(), h_view); + + } else { + d_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_dev::memory_space(), h_view); + } + } + + public: void resize(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 5fa59f1b7cd..5f7fcaf69e7 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1657,8 +1657,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - is_layouttiled::value) { + std::is_same_v) { for (int i = N; i < 7; ++i) layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; } @@ -1933,254 +1932,155 @@ struct MirrorDRVType { } // namespace Impl namespace Impl { -template -inline typename DynRankView::HostMirror create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using src_type = DynRankView; - using dst_type = 
typename src_type::HostMirror; - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template +inline auto create_mirror(const DynRankView& src, + const Impl::ViewCtorProp& arg_prop) { + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); -} - -template -inline auto create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using dst_type = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using dst_type = typename Impl::MirrorDRVType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type; - auto prop_copy = 
Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } else { + using src_type = DynRankView; + using dst_type = typename src_type::HostMirror; - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } } // namespace Impl -// Create a mirror in host space -template -inline typename DynRankView::HostMirror create_mirror( - const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { - return Impl::create_mirror(src, Kokkos::Impl::ViewCtorProp<>{}); +// public interface +template ::specialize>>> +inline auto create_mirror(const DynRankView& src) { + return Impl::create_mirror(src, Kokkos::view_alloc()); } -template -inline typename DynRankView::HostMirror create_mirror( - Kokkos::Impl::WithoutInitializing_t wi, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { +// public interface that accepts a without initializing flag +template ::specialize>>> +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, + const DynRankView& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -template -inline typename DynRankView::HostMirror create_mirror( - const Impl::ViewCtorProp& arg_prop, - const DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::ViewCtorProp::has_memory_space>* = nullptr) { - return Impl::create_mirror(src, arg_prop); -} - -// Create a mirror in a new space +// public interface that accepts a space template ::value && - std::is_void::specialize>::value>> -typename Impl::MirrorDRVType::view_type create_mirror( - const Space&, const 
Kokkos::DynRankView& src) { + std::is_void_v::specialize>>> +auto create_mirror(const Space&, const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template -typename Impl::MirrorDRVType::view_type create_mirror( - Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { +// public interface that accepts a space and a without initializing flag +template ::value && + std::is_void_v::specialize>>> +auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } -template -inline auto create_mirror( - const Impl::ViewCtorProp& arg_prop, - const DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::ViewCtorProp::has_memory_space>* = nullptr) { - using ReturnType = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - return ReturnType{Impl::create_mirror(src, arg_prop)}; +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>>> +inline auto create_mirror(const Impl::ViewCtorProp& arg_prop, + const DynRankView& src) { + return Impl::create_mirror(src, arg_prop); } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value, - typename DynRankView::HostMirror> -create_mirror_view(const DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline 
std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view( +inline auto create_mirror_view( const DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view(const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; + [[maybe_unused]] const typename Impl::ViewCtorProp& + arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename DynRankView< + T, P...>::HostMirror::data_type>::value) { + return typename DynRankView::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDRViewType::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename 
Impl::ViewCtorProp::memory_space>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view( - const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space +// public interface template -inline std::enable_if_t< - (std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view(const Kokkos::DynRankView& src) { - return src; -} - -template -inline std::enable_if_t< - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view(const Kokkos::DynRankView& src) { - return Kokkos::create_mirror(src); +inline auto create_mirror_view(const Kokkos::DynRankView& src) { + return Impl::create_mirror_view(src, Kokkos::view_alloc()); } +// public interface that accepts a without initializing flag template inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, const DynRankView& src) { return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror view in a new space -// FIXME_C++17 Improve SFINAE here. 
+// public interface that accepts a space template ::value>> -inline typename Impl::MirrorDRViewType::view_type -create_mirror_view( - const Space&, const Kokkos::DynRankView& src, - std::enable_if_t< - Impl::MirrorDRViewType::is_same_memspace>* = nullptr) { - return src; +inline auto create_mirror_view(const Space&, + const Kokkos::DynRankView& src) { + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space())); } -// FIXME_C++17 Improve SFINAE here. +// public interface that accepts a space and a without initializing flag template ::value>> -inline typename Impl::MirrorDRViewType::view_type -create_mirror_view( - const Space& space, const Kokkos::DynRankView& src, - std::enable_if_t< - !Impl::MirrorDRViewType::is_same_memspace>* = nullptr) { - return Kokkos::create_mirror(space, src); -} - -template + typename Enable = std::enable_if_t::value>> inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::DynRankView& src) { @@ -2188,6 +2088,8 @@ inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template inline auto create_mirror_view( const typename Impl::ViewCtorProp& arg_prop, @@ -2195,75 +2097,51 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::DynRankView& src) { using 
alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorDRViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename 
Mirror::non_const_type{ - arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorDRViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{ + arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } template diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index 12885edbae9..a4b74e246e0 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -590,96 +590,81 @@ struct MirrorDynamicViewType { } // namespace Impl namespace Impl { + +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline auto create_mirror( - const 
Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { +inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( - prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - - ret.resize_serial(src.extent(0)); - - return ret; -} + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using MemorySpace = typename alloc_prop_input::memory_space; -template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; + auto ret = typename Kokkos::Impl::MirrorDynamicViewType< + MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), + src.chunk_max() * src.chunk_size()); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to 
Kokkos::create_mirror must " - "not explicitly allow padding!"); - - using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + ret.resize_serial(src.extent(0)); - auto ret = typename Kokkos::Impl::MirrorDynamicViewType< - MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), - src.chunk_max() * src.chunk_size()); + return ret; + } else { + auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( + prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - ret.resize_serial(src.extent(0)); + ret.resize_serial(src.extent(0)); - return ret; + return ret; + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } + } // namespace Impl -// Create a mirror in host space -template +// public interface +template ::specialize>>> inline auto create_mirror( const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template +// public interface that accepts a without initializing flag +template ::specialize>>> inline auto create_mirror( Kokkos::Impl::WithoutInitializing_t wi, const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space -template +// public interface that accepts a space +template ::value && + std::is_void_v::specialize>>> inline auto create_mirror( const Space&, const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template +// public interface that accepts a space and a without initializing flag +template ::value && + std::is_void_v::specialize>>> typename Kokkos::Impl::MirrorDynamicViewType::view_type create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const 
Kokkos::Experimental::DynamicView& src) { @@ -687,7 +672,11 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } -template +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>>> inline auto create_mirror( const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { @@ -696,76 +685,56 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename 
Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::DynamicView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::DynamicView::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space +// public interface template inline auto create_mirror_view( const typename Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{}); } +// public interface that accepts a without initializing flag template inline auto 
create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, @@ -773,15 +742,18 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space -template +// public interface that accepts a space +template ::value>> inline auto create_mirror_view( const Space&, const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror_view(src, view_alloc(typename Space::memory_space{})); } -template +// public interface that accepts a space and a without initializing flag +template ::value>> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::DynamicView& src) { @@ -789,6 +761,8 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template inline auto create_mirror_view( const Impl::ViewCtorProp& arg_prop, @@ -985,80 +959,57 @@ struct ViewCopy, } // namespace Impl -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a 
pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = - typename Impl::MirrorDynamicViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type( - arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - mirror.resize_serial(src.extent(0)); - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + 
Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = + typename Impl::MirrorDynamicViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type( + arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + mirror.resize_serial(src.extent(0)); + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } -template +template ::value>> auto create_mirror_view_and_copy( const Space&, const Kokkos::Experimental::DynamicView& src, std::string const& name = "") { diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 91a7e4a9273..3adc70b1904 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -471,62 +471,31 @@ class OffsetView : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && is_layout_left && (traits::rank_dynamic == 0)), + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), reference_type> 
operator()(const I0& i0, const I1& i1) const { KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) const size_t j0 = i0 - m_begins[0]; const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && is_layout_left && (traits::rank_dynamic != 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && is_layout_right && (traits::rank_dynamic == 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && is_layout_right && (traits::rank_dynamic != 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, 
i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; + if constexpr (is_layout_left) { + if constexpr (traits::rank_dynamic == 0) + return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; + else + return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; + } else if constexpr (is_layout_right) { + if constexpr (traits::rank_dynamic == 0) + return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; + else + return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + + j1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined(KOKKOS_COMPILER_INTEL) + __builtin_unreachable(); +#endif } //------------------------------ @@ -1841,71 +1810,73 @@ struct MirrorOffsetType { } // namespace Impl namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space, - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return typename Kokkos::Experimental::OffsetView::HostMirror( - Kokkos::create_mirror(arg_prop, src.view()), src.begins()); -} -template ::has_memory_space>> +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { - using alloc_prop_input = Impl::ViewCtorProp; - using Space = typename Impl::ViewCtorProp::memory_space; + check_view_ctor_args_create_mirror(); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor 
arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using Space = typename Impl::ViewCtorProp::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetType::view_type( + prop_copy, src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); + } else { + return typename Kokkos::Experimental::OffsetView::HostMirror( + Kokkos::create_mirror(arg_prop, src.view()), src.begins()); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } + } // namespace Impl -// Create a mirror in host space -template +// public interface +template ::specialize>>> inline auto create_mirror( const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template +// public interface that accepts a without initializing flag +template ::specialize>>> inline auto create_mirror( Kokkos::Impl::WithoutInitializing_t wi, const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space +// public interface that accepts a space template ::value>> + typename Enable = std::enable_if_t< + Kokkos::is_space::value && + 
std::is_void_v::specialize>>> inline auto create_mirror( const Space&, const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template +// public interface that accepts a space and a without initializing flag +template ::value && + std::is_void_v::specialize>>> typename Kokkos::Impl::MirrorOffsetType::view_type create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView& src) { @@ -1913,7 +1884,11 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } -template +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>>> inline auto create_mirror( const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::OffsetView& src) { @@ -1921,76 +1896,56 @@ inline auto create_mirror( } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - 
typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::OffsetView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::OffsetView::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorOffsetViewType::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - 
return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space +// public interface template inline auto create_mirror_view( const typename Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{}); } +// public interface that accepts a without initializing flag template inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, @@ -1998,7 +1953,7 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror view in a new space +// public interface that accepts a space template ::value>> inline auto create_mirror_view( @@ -2007,7 +1962,9 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template +// public interface that accepts a space and a without initializing flag +template ::value>> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView& src) { @@ -2015,6 +1972,8 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template inline auto create_mirror_view( const Impl::ViewCtorProp& arg_prop, @@ -2022,7 +1981,9 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -// Create a mirror view and deep_copy in a new space +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template typename Kokkos::Impl::MirrorOffsetViewType< typename Impl::ViewCtorProp::memory_space, T, diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index 78a6a238ece..c3a8b67df8d 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -805,56 
+805,94 @@ class UnorderedMap { return *this; } + // Re-allocate the views of the calling UnorderedMap according to src + // capacity, and deep copy the src data. template std::enable_if_t, key_type>::value && std::is_same, value_type>::value> create_copy_view( UnorderedMap const &src) { if ( != { - insertable_map_type tmp; - - tmp.m_bounded_insert = src.m_bounded_insert; - tmp.m_hasher = src.m_hasher; - tmp.m_equal_to = src.m_equal_to; - tmp.m_size() = src.m_size(); - tmp.m_available_indexes = bitset_type(src.capacity()); - tmp.m_hash_lists = size_type_view( - view_alloc(WithoutInitializing, "UnorderedMap hash list"), - src.m_hash_lists.extent(0)); - tmp.m_next_index = size_type_view( - view_alloc(WithoutInitializing, "UnorderedMap next index"), - src.m_next_index.extent(0)); - tmp.m_keys = - key_type_view(view_alloc(WithoutInitializing, "UnorderedMap keys"), - src.m_keys.extent(0)); - tmp.m_values = value_type_view( - view_alloc(WithoutInitializing, "UnorderedMap values"), - src.m_values.extent(0)); - tmp.m_scalars = scalars_view("UnorderedMap scalars"); - - Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes); + allocate_view(src); + deep_copy_view(src); + } + } + + // Allocate views of the calling UnorderedMap with the same capacity as the + // src. 
+ template + std::enable_if_t, key_type>::value && + std::is_same, value_type>::value> + allocate_view( + UnorderedMap const &src) { + insertable_map_type tmp; + + tmp.m_bounded_insert = src.m_bounded_insert; + tmp.m_hasher = src.m_hasher; + tmp.m_equal_to = src.m_equal_to; + tmp.m_size() = src.m_size(); + tmp.m_available_indexes = bitset_type(src.capacity()); + tmp.m_hash_lists = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap hash list"), + src.m_hash_lists.extent(0)); + tmp.m_next_index = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap next index"), + src.m_next_index.extent(0)); + tmp.m_keys = + key_type_view(view_alloc(WithoutInitializing, "UnorderedMap keys"), + src.m_keys.extent(0)); + tmp.m_values = + value_type_view(view_alloc(WithoutInitializing, "UnorderedMap values"), + src.m_values.extent(0)); + tmp.m_scalars = scalars_view("UnorderedMap scalars"); + + *this = tmp; + } + + // Deep copy view data from src. This requires that the src capacity is + // identical to the capacity of the calling UnorderedMap. + template + std::enable_if_t, key_type>::value && + std::is_same, value_type>::value> + deep_copy_view( + UnorderedMap const &src) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // To deep copy UnorderedMap, capacity must be identical + KOKKOS_EXPECTS(capacity() == src.capacity()); +#else + if (capacity() != src.capacity()) { + allocate_view(src); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning( + "Warning: deep_copy_view() allocating views is deprecated. 
Must call " + "with UnorderedMaps of identical capacity, or use " + "create_copy_view().\n"); +#endif + } +#endif + + if ( != { + Kokkos::deep_copy(m_available_indexes, src.m_available_indexes); using raw_deep_copy = Kokkos::Impl::DeepCopy; - raw_deep_copy(,, + raw_deep_copy(,, sizeof(size_type) * src.m_hash_lists.extent(0)); - raw_deep_copy(,, + raw_deep_copy(,, sizeof(size_type) * src.m_next_index.extent(0)); - raw_deep_copy(,, + raw_deep_copy(,, sizeof(key_type) * src.m_keys.extent(0)); if (!is_set) { - raw_deep_copy(,, + raw_deep_copy(,, sizeof(impl_value_type) * src.m_values.extent(0)); } - raw_deep_copy(,, + raw_deep_copy(,, sizeof(int) * num_scalars); Kokkos::fence( - "Kokkos::UnorderedMap::create_copy_view: fence after copy to tmp"); - - *this = tmp; + "Kokkos::UnorderedMap::deep_copy_view: fence after copy to dst."); } } @@ -932,13 +970,25 @@ class UnorderedMap { friend struct Impl::UnorderedMapPrint; }; -// Specialization of deep_copy for two UnorderedMap objects. +// Specialization of deep_copy() for two UnorderedMap objects. template inline void deep_copy( UnorderedMap &dst, const UnorderedMap &src) { - dst.create_copy_view(src); + dst.deep_copy_view(src); +} + +// Specialization of create_mirror() for an UnorderedMap object. 
+template +typename UnorderedMap::HostMirror +create_mirror( + const UnorderedMap &src) { + typename UnorderedMap::HostMirror + dst; + dst.allocate_view(src); + return dst; } } // namespace Kokkos diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index a15e5fa2997..2512cb5c491 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -55,8 +55,8 @@ struct test_dualview_alloc { bool result = false; test_dualview_alloc(unsigned int size) { - result = run_me >( - size, 3); + result = + run_me>(size, 3); } }; @@ -154,7 +154,7 @@ struct test_dualview_combinations { } test_dualview_combinations(unsigned int size, bool with_init) { - result = run_me >( + result = run_me>( size, 3, with_init); } }; @@ -253,21 +253,18 @@ struct test_dual_view_deep_copy { } // end run_me test_dual_view_deep_copy() { - run_me >(10, 5, - true); - run_me >(10, 5, - false); + run_me>(10, 5, true); + run_me>(10, 5, + false); // Test zero length but allocated (!=nullptr but // a.d_view.span()==0) - run_me >(0, 5, true); - run_me >(0, 5, - false); + run_me>(0, 5, true); + run_me>(0, 5, false); // Test default constructed view - run_me >(-1, 5, - true); - run_me >(-1, 5, - false); + run_me>(-1, 5, true); + run_me>(-1, 5, + false); } }; @@ -282,15 +279,20 @@ struct test_dualview_resize { const unsigned int m = 5; const unsigned int factor = 2; - ViewType a("A", n, m); + ViewType a; + if constexpr (Initialize) + a = ViewType("A", n, m); + else + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::deep_copy(a.d_view, 1); /* Covers case "Resize on Device" */ a.modify_device(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); - else + if constexpr (Initialize) Kokkos::resize(a, factor * n, factor * m); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); ASSERT_EQ(a.extent(0), 
n * factor); ASSERT_EQ(a.extent(1), m * factor); @@ -298,33 +300,38 @@ struct test_dualview_resize { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(; + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, a_d_sum); - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); /* Covers case "Resize on Host" */ a.modify_host(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); - else + if constexpr (Initialize) Kokkos::resize(a, n / factor, m / factor); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); ASSERT_EQ(a.extent(0), n / factor); ASSERT_EQ(a.extent(1), m / factor); @@ -332,30 +339,33 @@ struct test_dualview_resize { a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected - a_d_sum = 0; + Kokkos::deep_copy(errors_d, 0); // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - 
SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(; + }); + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - a_h_sum = 0; + errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_resize() { - run_me >(); + run_me>(); } }; @@ -369,40 +379,51 @@ struct test_dualview_realloc { const unsigned int n = 10; const unsigned int m = 5; - ViewType a("A", n, m); - if (Initialize) - Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); - else + ViewType a; + if constexpr (Initialize) { + a = ViewType("A", n, m); Kokkos::realloc(a, n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); + } + ASSERT_EQ(a.extent(0), n); + ASSERT_EQ(a.extent(1), m); Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(; + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, 
errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_realloc() { - run_me >(); + run_me>(); } }; @@ -463,12 +484,23 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { test_dualview_deep_copy(); } +struct NoDefaultConstructor { + NoDefaultConstructor(int i_) : i(i_) {} + KOKKOS_FUNCTION operator int() const { return i; } + + int i; +}; + TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); + Impl::test_dualview_realloc(); } TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); + Impl::test_dualview_resize(); } namespace { diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index f63f1c6afe3..4a7e826ecbe 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -68,7 +68,7 @@ struct TestInsert { } while (rehash_on_fail && failed_count > 0u); // Trigger the m_size mutable bug. 
- typename map_type::HostMirror map_h; + auto map_h = create_mirror(map); execution_space().fence(); Kokkos::deep_copy(map_h, map); execution_space().fence(); @@ -367,7 +367,7 @@ void test_deep_copy(uint32_t num_nodes) { } } - host_map_type hmap; + auto hmap = create_mirror(map); Kokkos::deep_copy(hmap, map); ASSERT_EQ(map.size(), hmap.size()); @@ -380,6 +380,7 @@ void test_deep_copy(uint32_t num_nodes) { } map_type mmap; + mmap.allocate_view(hmap); Kokkos::deep_copy(mmap, hmap); const_map_type cmap = mmap; @@ -424,7 +425,7 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { Map n{}; n = Map{m.capacity()}; n.rehash(m.capacity()); - Kokkos::deep_copy(n, m); + n.create_copy_view(m); ASSERT_TRUE(m.is_allocated()); ASSERT_TRUE(n.is_allocated()); } diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp index a7d341b789d..abed2676d76 100644 --- a/lib/kokkos/containers/unit_tests/TestVector.hpp +++ b/lib/kokkos/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include namespace Test { @@ -231,7 +233,7 @@ void test_vector_allocate(unsigned int size) { TEST(TEST_CATEGORY, vector_combination) { test_vector_allocate(10); test_vector_combinations(10); - test_vector_combinations(3057); + test_vector_combinations(3057); } TEST(TEST_CATEGORY, vector_insert) { diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp index ce3059f47d3..af74723e7e0 100644 --- a/lib/kokkos/core/perf_test/test_atomic.cpp +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -390,7 +390,7 @@ static void Test_Atomic(benchmark::State& state) { static constexpr int LOOP = 100'000; -BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(30'000)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); 
BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); @@ -398,4 +398,3 @@ BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); -BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp index b838c8eccf0..bc35d1c776f 100644 --- a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp +++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -183,7 +183,8 @@ double atomic_contentious_max_replacement(benchmark::State& state, Kokkos::parallel_reduce( con_length, KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_max_fetch(&(input(0)), inner + 1); + inner = Kokkos::atomic_max_fetch(&(input(0)), + Kokkos::min(inner, max - 1) + 1); if (i == con_length - 1) { Kokkos::atomic_max_fetch(&(input(0)), max); inner = max; @@ -223,7 +224,8 @@ double atomic_contentious_min_replacement(benchmark::State& state, Kokkos::parallel_reduce( con_length, KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1); + inner = Kokkos::atomic_min_fetch(&(input(0)), + Kokkos::max(inner, min + 1) - 1); if (i == con_length - 1) { Kokkos::atomic_min_fetch(&(input(0)), min); inner = min; @@ -246,7 +248,7 @@ static void Atomic_ContentiousMinReplacements(benchmark::State& state) { auto inp = prepare_input(1, std::numeric_limits::max()); for (auto _ : state) { - const auto time = atomic_contentious_max_replacement(state, inp, length); + const auto time = atomic_contentious_min_replacement(state, inp, length); state.SetIterationTime(time); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index 276d03da265..fd86976d3ba 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -166,8 
+166,17 @@ class Cuda { Cuda(); - Cuda(cudaStream_t stream, - Impl::ManageStream manage_stream = Impl::ManageStream::no); + explicit Cuda(cudaStream_t stream) : Cuda(stream, Impl::ManageStream::no) {} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "Cuda execution space should be constructed explicitly.") + Cuda(cudaStream_t stream) + : Cuda(stream) {} +#endif + + Cuda(cudaStream_t stream, Impl::ManageStream manage_stream); KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool manage_stream); @@ -186,7 +195,7 @@ class Cuda { /// /// This matches the __CUDA_ARCH__ specification. KOKKOS_DEPRECATED static size_type device_arch() { - const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + const cudaDeviceProp cudaProp = Cuda().cuda_device_prop(); return cudaProp.major * 100 + cudaProp.minor; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 0944937e1bf..75318aff778 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -59,12 +59,6 @@ const std::unique_ptr &Kokkos::Impl::cuda_get_deep_copy_space( namespace Kokkos { namespace Impl { -namespace { - -static std::atomic num_uvm_allocations(0); - -} // namespace - void DeepCopyCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( dst, src, n, cudaMemcpyDefault))); @@ -204,10 +198,7 @@ void *impl_allocate_common(const int device_id, // we should do here since we're turning it into an // exception here cudaGetLastError(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaMalloc); + Kokkos::Impl::throw_bad_alloc(, arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { @@ -252,8 +243,6 @@ void *CudaUVMSpace::impl_allocate( Cuda::impl_static_fence( 
"Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation"); if (arg_alloc_size > 0) { - Kokkos::Impl::num_uvm_allocations++; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); cudaError_t error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); @@ -263,10 +252,7 @@ void *CudaUVMSpace::impl_allocate( // we should do here since we're turning it into an // exception here cudaGetLastError(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaMallocManaged); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST @@ -307,10 +293,7 @@ void *CudaHostPinnedSpace::impl_allocate( // we should do here since we're turning it into an // exception here cudaGetLastError(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaHostAlloc); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -341,27 +324,24 @@ void CudaSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async free"); - } else { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); - } -#else + if (arg_alloc_size >= memory_threshold_g) { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence before async free"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async free"); + } else { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); -#endif - } catch (...) { } +#else + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); +#endif } void CudaUVMSpace::deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const { @@ -387,13 +367,9 @@ void CudaUVMSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - if (arg_alloc_ptr != nullptr) { - Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); - } - } catch (...) 
{ + if (arg_alloc_ptr != nullptr) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation"); @@ -420,11 +396,8 @@ void CudaHostPinnedSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); - } catch (...) { - } + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index c4458c910ca..66656fefda5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { namespace Impl { @@ -69,52 +68,6 @@ inline void cuda_internal_safe_call(cudaError e, const char* name, Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl - -namespace Experimental { - -class CudaRawMemoryAllocationFailure : public RawMemoryAllocationFailure { - private: - using base_t = RawMemoryAllocationFailure; - - cudaError_t m_error_code = cudaSuccess; - - static FailureMode get_failure_mode(cudaError_t error_code) { - switch (error_code) { - case cudaErrorMemoryAllocation: return FailureMode::OutOfMemoryError; - case cudaErrorInvalidValue: return FailureMode::InvalidAllocationSize; - // TODO handle cudaErrorNotSupported for cudaMallocManaged - default: return FailureMode::Unknown; - } - } - - public: - // using base_t::base_t; - // would trigger - // - // error: cannot determine the exception specification of the default - // constructor due to a circular dependency - // - // using NVCC 9.1 and gcc 7.4 - CudaRawMemoryAllocationFailure( - size_t 
arg_attempted_size, size_t arg_attempted_alignment, - FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, - AllocationMechanism arg_mechanism = - AllocationMechanism::StdMalloc) noexcept - : base_t(arg_attempted_size, arg_attempted_alignment, arg_failure_mode, - arg_mechanism) {} - - CudaRawMemoryAllocationFailure(size_t arg_attempted_size, - cudaError_t arg_error_code, - AllocationMechanism arg_mechanism) noexcept - : base_t(arg_attempted_size, /* CudaSpace doesn't handle alignment? */ 1, - get_failure_mode(arg_error_code), arg_mechanism), - m_error_code(arg_error_code) {} - - void append_additional_error_information(std::ostream& o) const override; -}; - -} // end namespace Experimental - } // namespace Kokkos #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index fcc3ff04ff5..625d8c317a1 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -72,7 +72,7 @@ struct GraphImpl { GraphNodeImpl; - // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object GraphImpl() = delete; GraphImpl(GraphImpl const&) = delete; @@ -115,12 +115,9 @@ struct GraphImpl { template // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl - // Also requires that the kernel has the graph node tag in it's policy + // Also requires that the kernel has the graph node tag in its policy void add_node(std::shared_ptr const& arg_node_ptr) { - static_assert( - NodeImpl::kernel_type::Policy::is_graph_kernel::value, - "Something has gone horribly wrong, but it's too complicated to " - "explain here. 
Buy Daisy a coffee and she'll explain it to you."); + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(bool(arg_node_ptr)); // The Kernel launch from the execute() method has been shimmed to insert // the node into the graph diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 849e8b3b30e..89a00028969 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -737,6 +737,14 @@ namespace Impl { int g_cuda_space_factory_initialized = initialize_space_factory("150_Cuda"); +int CudaInternal::m_cudaArch = -1; +cudaDeviceProp CudaInternal::m_deviceProp; +std::set CudaInternal::cuda_devices = {}; +std::map CudaInternal::constantMemHostStagingPerDevice = + {}; +std::map CudaInternal::constantMemReusablePerDevice = {}; +std::map CudaInternal::constantMemMutexPerDevice = {}; + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 24f4af31019..ffaa0f54749 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -91,10 +91,10 @@ class CudaInternal { int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; + static int m_cudaArch; static int concurrency(); - inline static cudaDeviceProp m_deviceProp; + static cudaDeviceProp m_deviceProp; // Scratch Spaces for Reductions mutable std::size_t m_scratchSpaceCount; @@ -120,11 +120,10 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - inline static std::set cuda_devices = {}; - inline static std::map constantMemHostStagingPerDevice = - {}; - inline static std::map constantMemReusablePerDevice = {}; - inline static std::map constantMemMutexPerDevice = {}; + static std::set cuda_devices; + static std::map constantMemHostStagingPerDevice; + static 
std::map constantMemReusablePerDevice; + static std::map constantMemMutexPerDevice; static CudaInternal& singleton(); @@ -421,23 +420,6 @@ class CudaInternal { return cudaStreamSynchronize(stream); } - // The following are only available for cuda 11.2 and greater -#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - template - cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaMallocAsync(devPtr, size, get_input_stream(hStream)); - } - - template - cudaError_t cuda_free_async_wrapper(void* devPtr, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaFreeAsync(devPtr, get_input_stream(hStream)); - } -#endif - // C++ API routines template cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr, diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 9f7be45c839..71e77518210 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,17 +539,9 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance =; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes( - internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -585,13 +577,7 @@ class ParallelFor, "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } - if (int(m_team_size) > - int(Kokkos::Impl::cuda_get_max_block_size( - internal_space_instance, attr, arg_functor, - arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -909,17 +895,11 @@ class ParallelReduce:: - get_cuda_func_attributes(internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, - m_functor_reducer.get_functor(), m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); m_team_begin = UseShflReduction diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp index c8d6641d1ee..18aca15065e 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -28,35 +28,20 @@ extern "C" { /* Cuda runtime function, declared in * Requires capability 2.x or better. 
*/ -extern __device__ void __assertfail(const void *message, const void *file, - unsigned int line, const void *function, - size_t charsize); +[[noreturn]] __device__ void __assertfail(const void *message, const void *file, + unsigned int line, + const void *function, + size_t charsize); } namespace Kokkos { namespace Impl { -// required to workaround failures in random number generator unit tests with -// pre-volta architectures -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -__device__ inline void cuda_abort(const char *const message) { -#else -[[noreturn]] __device__ inline void cuda_abort(const char *const message) { -#endif +[[noreturn]] __device__ static void cuda_abort(const char *const message) { const char empty[] = ""; __assertfail((const void *)message, (const void *)empty, (unsigned int)0, (const void *)empty, sizeof(char)); - - // This loop is never executed. It's intended to suppress warnings that the - // function returns, even though it does not. This is necessary because - // __assertfail is not marked as [[noreturn]], even though it does not return. 
- // Disable with KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK to workaround failures - // in random number generator unit tests with pre-volta architectures -#if !defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - while (true) - ; -#endif } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp index 3a88e97ee3d..439075fc6cc 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp @@ -48,8 +48,19 @@ class HIP { using scratch_memory_space = ScratchMemorySpace; HIP(); - HIP(hipStream_t stream, - Impl::ManageStream manage_stream = Impl::ManageStream::no); + + explicit HIP(hipStream_t stream) : HIP(stream, Impl::ManageStream::no) {} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "HIP execution space should be constructed explicitly.") + HIP(hipStream_t stream) + : HIP(stream) {} +#endif + + HIP(hipStream_t stream, Impl::ManageStream manage_stream); + KOKKOS_DEPRECATED HIP(hipStream_t stream, bool manage_stream); //@} diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index 43d63c090b3..fa45dcfec31 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -22,8 +22,6 @@ #include -#include - namespace Kokkos { namespace Impl { @@ -44,39 +42,4 @@ inline void hip_internal_safe_call(hipError_t e, const char* name, #define KOKKOS_IMPL_HIP_SAFE_CALL(call) \ Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__) -namespace Kokkos { -namespace Experimental { - -class HIPRawMemoryAllocationFailure : public RawMemoryAllocationFailure { - private: - hipError_t m_error_code = hipSuccess; - - static FailureMode get_failure_mode(hipError_t error_code) { - switch (error_code) { - case hipErrorMemoryAllocation: return FailureMode::OutOfMemoryError; - case hipErrorInvalidValue: return FailureMode::InvalidAllocationSize; - default: return 
FailureMode::Unknown; - } - } - - public: - HIPRawMemoryAllocationFailure(size_t arg_attempted_size, - hipError_t arg_error_code, - AllocationMechanism arg_mechanism) noexcept - : RawMemoryAllocationFailure( - arg_attempted_size, /* HIPSpace doesn't handle alignment? */ 1, - get_failure_mode(arg_error_code), arg_mechanism), - m_error_code(arg_error_code) {} - - void append_additional_error_information(std::ostream& o) const override { - if (m_error_code != hipSuccess) { - o << " The HIP allocation returned the error code \"" - << hipGetErrorName(m_error_code) << "\"."; - } - } -}; - -} // namespace Experimental -} // namespace Kokkos - #endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 7cc06d02fbe..a0989fe6711 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -40,7 +40,7 @@ class GraphImpl { GraphNodeImpl; - // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
GraphImpl() = delete; GraphImpl(GraphImpl const&) = delete; @@ -108,7 +108,7 @@ inline void GraphImpl::add_node( } // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl -// Also requires that the kernel has the graph node tag in it's policy +// Also requires that the kernel has the graph node tag in its policy template inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 22c0db047f6..e0b25c69399 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -353,6 +353,22 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } +int HIPInternal::m_hipDev = -1; +unsigned HIPInternal::m_multiProcCount = 0; +unsigned HIPInternal::m_maxWarpCount = 0; +std::array HIPInternal::m_maxBlock = {0, 0, 0}; +unsigned HIPInternal::m_maxWavesPerCU = 0; +int HIPInternal::m_shmemPerSM = 0; +int HIPInternal::m_maxShmemPerBlock = 0; +int HIPInternal::m_maxThreadsPerSM = 0; + +hipDeviceProp_t HIPInternal::m_deviceProp; + +std::mutex HIPInternal::scratchFunctorMutex; +unsigned long *HIPInternal::constantMemHostStaging = nullptr; +hipEvent_t HIPInternal::constantMemReusable = nullptr; +std::mutex HIPInternal::constantMemMutex; + //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 7b55f519c2a..19349e90bb1 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -35,8 +35,7 @@ struct HIPTraits { static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ -#elif defined(KOKKOS_ARCH_AMD_GFX1030) || 
defined(KOKKOS_ARCH_AMD_GFX1100) || \ - defined(KOKKOS_ARCH_AMD_GFX1103) +#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) static constexpr int WarpSize = 32; static constexpr int WarpIndexMask = 0x001f; /* hexadecimal for 31 */ static constexpr int WarpIndexShift = 5; /* WarpSize == 1 << WarpShift*/ @@ -71,16 +70,16 @@ class HIPInternal { public: using size_type = ::Kokkos::HIP::size_type; - inline static int m_hipDev = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static unsigned m_maxWavesPerCU = 0; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxThreadsPerSM = 0; + static int m_hipDev; + static unsigned m_multiProcCount; + static unsigned m_maxWarpCount; + static std::array m_maxBlock; + static unsigned m_maxWavesPerCU; + static int m_shmemPerSM; + static int m_maxShmemPerBlock; + static int m_maxThreadsPerSM; - inline static hipDeviceProp_t m_deviceProp; + static hipDeviceProp_t m_deviceProp; static int concurrency(); @@ -93,7 +92,7 @@ class HIPInternal { size_type *m_scratchFlags = nullptr; mutable size_type *m_scratchFunctor = nullptr; mutable size_type *m_scratchFunctorHost = nullptr; - inline static std::mutex scratchFunctorMutex; + static std::mutex scratchFunctorMutex; hipStream_t m_stream = nullptr; uint32_t m_instance_id = @@ -112,9 +111,9 @@ class HIPInternal { // FIXME_HIP: these want to be per-device, not per-stream... 
use of 'static' // here will break once there are multiple devices though - inline static unsigned long *constantMemHostStaging = nullptr; - inline static hipEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + static unsigned long *constantMemHostStaging; + static hipEvent_t constantMemReusable; + static std::mutex constantMemMutex; static HIPInternal &singleton(); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 55b6218d1c8..16295116462 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -50,6 +50,7 @@ class ParallelReduce class ParallelReduce, HIP> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -46,6 +46,7 @@ class ParallelReduce is_first_hip_managed_allocation(true); @@ -66,7 +67,6 @@ void* HIPSpace::allocate( return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); } void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { @@ -77,10 +77,7 @@ void* HIPSpace::impl_allocate( // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPMalloc); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -111,10 +108,7 @@ void* HIPHostPinnedSpace::impl_allocate( // This is the only way to clear the last error, which we should do 
here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPHostMalloc); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -178,10 +172,7 @@ Kokkos::HIP::runtime WARNING: Kokkos did not find an environment variable 'HSA_X // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPMallocManaged); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } KOKKOS_IMPL_HIP_SAFE_CALL(hipMemAdvise( ptr, arg_alloc_size, hipMemAdviseSetCoarseGrain, m_device)); diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index 6d541a64148..1f3d0783449 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -153,7 +153,7 @@ void HPX::impl_instance_fence_locked(const std::string &name) const { auto &s = impl_get_sender(); hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } @@ -184,7 +184,7 @@ void HPX::impl_static_fence(const std::string &name) { } hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 26181a7c05d..245dc128ca8 100644 --- 
a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -168,17 +168,31 @@ class HPX { : m_instance_data(Kokkos::Impl::HostSharedPtr( &m_default_instance_data, &default_instance_deleter)) {} ~HPX() = default; - HPX(instance_mode mode) + explicit HPX(instance_mode mode) : m_instance_data( mode == instance_mode::independent ? (Kokkos::Impl::HostSharedPtr( new instance_data(m_next_instance_id++))) : Kokkos::Impl::HostSharedPtr( &m_default_instance_data, &default_instance_deleter)) {} - HPX(hpx::execution::experimental::unique_any_sender<> &&sender) + explicit HPX(hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_data(Kokkos::Impl::HostSharedPtr( new instance_data(m_next_instance_id++, std::move(sender)))) {} +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "HPX execution space should be constructed explicitly.") + HPX(instance_mode mode) + : HPX(mode) {} + + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "HPX execution space should be constructed explicitly.") + HPX(hpx::execution::experimental::unique_any_sender<> &&sender) + : HPX(std::move(sender)) {} +#endif + HPX(HPX &&other) = default; HPX(const HPX &other) = default; diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index ba1626bb72e..4d905fbc553 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include namespace Kokkos { @@ -80,7 +79,11 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. 
*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template +#else +template +#endif struct Array { public: /** @@ -129,10 +132,38 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + friend KOKKOS_FUNCTION constexpr bool operator==(Array const& lhs, + Array const& rhs) noexcept { + for (size_t i = 0; i != N; ++i) + if (lhs[i] != rhs[i]) return false; + return true; + } + + friend KOKKOS_FUNCTION constexpr bool operator!=(Array const& lhs, + Array const& rhs) noexcept { + return !(lhs == rhs); + } + + private: + template + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable::value> + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template struct Array { +#else +template +struct Array { +#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -167,25 +198,35 @@ struct Array { KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } - KOKKOS_DEFAULTED_FUNCTION ~Array() = default; - KOKKOS_DEFAULTED_FUNCTION Array() = default; - KOKKOS_DEFAULTED_FUNCTION Array(const Array&) = default; - KOKKOS_DEFAULTED_FUNCTION Array& operator=(const Array&) = default; + friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, + Array const&) noexcept { + return true; + } + friend KOKKOS_FUNCTION constexpr bool operator!=(Array const&, + Array const&) noexcept { + return false; + } - // Some supported compilers are not sufficiently C++11 compliant - // for default move constructor and move assignment operator. 
- // Array( Array && ) = default ; - // Array & operator = ( Array && ) = default ; + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array&, Array&) noexcept {} }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> -struct Array { - struct contiguous {}; - struct strided {}; +struct KOKKOS_DEPRECATED Array { + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template -struct Array::contiguous> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -253,7 +294,8 @@ struct Array::contiguous> { }; template -struct Array::strided> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -320,10 +362,37 @@ struct Array::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif template Array(T, Us...)->Array; +namespace Impl { + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T (&a)[N], std::index_sequence) { + return {{a[I]...}}; +} + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T(&&a)[N], std::index_sequence) { + return {{std::move(a[I])...}}; +} + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { + return Impl::to_array_impl(a, std::make_index_sequence{}); +} + +template +KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { + return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); +} + } // namespace Kokkos // @@ -333,6 +402,7 @@ struct std::tuple_size> template struct std::tuple_element> { + static_assert(I < N); using type = T; }; @@ -340,21 +410,25 @@ namespace Kokkos { template KOKKOS_FUNCTION constexpr T& get(Array& a) noexcept { + static_assert(I < N); return a[I]; } template KOKKOS_FUNCTION constexpr T const& get(Array const& a) noexcept { + static_assert(I < N); return a[I]; } template KOKKOS_FUNCTION constexpr T&& 
get(Array&& a) noexcept { + static_assert(I < N); return std::move(a[I]); } template KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { + static_assert(I < N); return std::move(a[I]); } diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 9acacef901a..bf57dcae650 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -22,7 +22,6 @@ static_assert(false, #ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ #define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ #include -#include #include #ifdef KOKKOS_ENABLE_ATOMICS_BYPASS diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index eebdd20f15d..26db69ac1f1 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -22,8 +22,6 @@ static_assert(false, #ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_ #define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_ #include - -#include #include #include diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 4d405116ccf..7dd2a9ddbb7 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace Kokkos { @@ -256,6 +257,12 @@ class return *this; } + template + friend constexpr const RT& get(const complex&) noexcept; + + template + friend constexpr const RT&& get(const complex&&) noexcept; + #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. 
template < @@ -423,6 +430,75 @@ class #endif // KOKKOS_ENABLE_DEPRECATED_CODE_4 }; +} // namespace Kokkos + +// Tuple protocol for complex based on (voted into +// the C++26 working draft on 2023-11) + +template +struct std::tuple_size> + : std::integral_constant {}; + +template +struct std::tuple_element> { + static_assert(I < 2); + using type = RealType; +}; + +namespace Kokkos { + +// get<...>(...) defined here so as not to be hidden friends, as per P2819R2 + +template +KOKKOS_FUNCTION constexpr RealType& get(complex& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return z.real(); + else + return z.imag(); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template +KOKKOS_FUNCTION constexpr RealType&& get(complex&& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return std::move(z.real()); + else + return std::move(z.imag()); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template +KOKKOS_FUNCTION constexpr const RealType& get( + const complex& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return z.re_; + else + return z.im_; +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template +KOKKOS_FUNCTION constexpr const RealType&& get( + const complex&& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return std::move(z.re_); + else + return std::move(z.im_); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + //============================================================================== // {{{1 diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..e856b192471 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -221,10 +221,12 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 7D 
views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, - {a.extent(0), a.extent(1), a.extent(2), - a.extent(3), a.extent(5), a.extent(6)}), + {a.extent(0), a.extent(1), a.extent(3), + a.extent(4), a.extent(5), a.extent(6)}), *this); } @@ -249,6 +251,8 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewFill-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -293,9 +297,11 @@ struct ViewCopy { ViewTypeA a; ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -323,9 +329,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -354,9 +362,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - 
Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<4, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -386,9 +396,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<5, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -418,9 +430,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -450,9 +464,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + 
Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -461,6 +477,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewCopy-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -483,9 +501,11 @@ struct ViewCopy { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -494,6 +514,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewCopy-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -539,11 +561,8 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -630,11 +649,8 @@ void view_copy(const DstType& dst, const SrcType& src) 
{ int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -3092,8 +3108,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + Kokkos::LayoutStride>::value> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3139,8 +3154,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3235,7 +3249,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3308,8 +3325,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + Kokkos::LayoutStride>::value> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3331,7 +3347,10 @@ impl_realloc(Kokkos::View& v, if (v.layout() != layout) { v = 
view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3351,8 +3370,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3452,6 +3470,7 @@ struct MirrorType { using view_type = Kokkos::View; }; +// collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { using alloc_prop_input = Impl::ViewCtorProp; @@ -3470,232 +3489,231 @@ void check_view_ctor_args_create_mirror() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t::has_memory_space, - typename Kokkos::View::HostMirror> -create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - using src_type = View; - using dst_type = typename src_type::HostMirror; - +inline auto create_mirror(const Kokkos::View& src, + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, src.layout()); -} - -// Create a mirror in a new space (specialization for different space) -template ::has_memory_space>> -auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - check_view_ctor_args_create_mirror(); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, 
std::string(src.label()).append("_mirror")); - using alloc_prop = decltype(prop_copy); - - return typename Impl::MirrorType::view_type(prop_copy, src.layout()); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using memory_space = typename decltype(prop_copy)::memory_space; + using dst_type = + typename Impl::MirrorType::view_type; + return dst_type(prop_copy, src.layout()); + } else { + using dst_type = typename View::HostMirror; + return dst_type(prop_copy, src.layout()); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } } // namespace Impl -template -std::enable_if_t::specialize>::value, - typename Kokkos::View::HostMirror> -create_mirror(Kokkos::View const& v) { - return Impl::create_mirror(v, Impl::ViewCtorProp<>{}); +// public interface +template ::specialize>>> +auto create_mirror(Kokkos::View const& src) { + return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template -std::enable_if_t::specialize>::value, - typename Kokkos::View::HostMirror> -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(wi)); +// public interface that accepts a without initializing flag +template ::specialize>>> +auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, + Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(wi)); } +// public interface that accepts a space template ::value>> -std::enable_if_t::specialize>::value, - typename Impl::MirrorType::view_type> -create_mirror(Space const&, Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{})); + typename Enable = std::enable_if_t< + Kokkos::is_space::value && + std::is_void_v::specialize>>> +auto create_mirror(Space const&, Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(typename Space::memory_space{})); } +// 
public interface that accepts arbitrary view constructor args passed by a +// view_alloc template ::specialize>::value && - Impl::ViewCtorProp::has_memory_space>> + typename = std::enable_if_t< + std::is_void_v::specialize>>> auto create_mirror(Impl::ViewCtorProp const& arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); -} - -template -std::enable_if_t< - std::is_void::specialize>::value && - !Impl::ViewCtorProp::has_memory_space, - typename Kokkos::View::HostMirror> -create_mirror(Impl::ViewCtorProp const& arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View const& src) { + return Impl::create_mirror(src, arg_prop); } +// public interface that accepts a space and a without initializing flag template ::value>> -std::enable_if_t::specialize>::value, - typename Impl::MirrorType::view_type> -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}, wi)); + typename Enable = std::enable_if_t< + Kokkos::is_space::value && + std::is_void_v::specialize>>> +auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, + Kokkos::View const& src) { + return Impl::create_mirror(src, + view_alloc(typename Space::memory_space{}, wi)); } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} +// choose a `Kokkos::create_mirror` adapted for the provided view and the +// provided arguments +template +inline auto choose_create_mirror( + const View& src, const 
Impl::ViewCtorProp& arg_prop) { + // Due to the fact that users can overload `Kokkos::create_mirror`, but also + // that they may not have implemented all of its different possible + // variations, this function chooses the correct private or public version of + // it to call. + // This helper should be used by any overload of + // `Kokkos::Impl::create_mirror_view`. + + if constexpr (std::is_void_v) { + // if the view is not specialized, just call the Impl function + + // using ADL to find the later defined overload of the function + using namespace Kokkos::Impl; + + return create_mirror(src, arg_prop); + } else { + // otherwise, recreate the public call + using ViewProp = Impl::ViewCtorProp; + + // using ADL to find the later defined overload of the function + using namespace Kokkos; + + if constexpr (sizeof...(ViewCtorArgs) == 0) { + // if there are no view constructor args, call the specific public + // function + return create_mirror(src); + } else if constexpr (sizeof...(ViewCtorArgs) == 1 && + ViewProp::has_memory_space) { + // if there is one view constructor arg and it has a memory space, call + // the specific public function + return create_mirror(typename ViewProp::memory_space{}, src); + } else if constexpr (sizeof...(ViewCtorArgs) == 1 && + !ViewProp::initialize) { + // if there is one view constructor arg and it has a without initializing + // mark, call the specific public function + return create_mirror(typename Kokkos::Impl::WithoutInitializing_t{}, src); + } else if constexpr (sizeof...(ViewCtorArgs) == 2 && + ViewProp::has_memory_space && !ViewProp::initialize) { + // if there is two view constructor args and they have a memory space and + // a without initializing mark, call the specific public function + return create_mirror(typename Kokkos::Impl::WithoutInitializing_t{}, + typename ViewProp::memory_space{}, src); + } else { + // if there are other constructor args, call the generic public function -template -inline std::enable_if_t< - 
!Impl::ViewCtorProp::has_memory_space && - !(std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -// Create a mirror view in a new space (specialization for same space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} + // Beware, there are some libraries using Kokkos that don't implement + // this overload (hence the reason for this present function to exist). + return create_mirror(arg_prop, src); + } + } -// Create a mirror view in a new space (specialization for different space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -} // namespace Impl -template -std::enable_if_t< - std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value, - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src) { - return src; +// 
create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template +inline auto create_mirror_view( + const Kokkos::View& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::View< + T, P...>::HostMirror::data_type>::value) { + check_view_ctor_args_create_mirror(); + return typename Kokkos::View::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorViewType::memory_space, + T, P...>::is_same_memspace) { + check_view_ctor_args_create_mirror(); + return typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } +} // namespace Impl +// public interface template -std::enable_if_t< - !(std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src) { - return Kokkos::create_mirror(src); +auto create_mirror_view(const Kokkos::View& src) { + return Impl::create_mirror_view(src, view_alloc()); } +// public interface that accepts a without initializing flag template -typename Kokkos::View::HostMirror create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View const& v) { - return Impl::create_mirror_view(v, view_alloc(wi)); +auto 
create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, + Kokkos::View const& src) { + return Impl::create_mirror_view(src, view_alloc(wi)); } -// FIXME_C++17 Improve SFINAE here. +// public interface that accepts a space template ::value>> -typename Impl::MirrorViewType::view_type create_mirror_view( - const Space&, const Kokkos::View& src, - std::enable_if_t::is_same_memspace>* = - nullptr) { - return src; -} - -// FIXME_C++17 Improve SFINAE here. -template ::value>> -typename Impl::MirrorViewType::view_type create_mirror_view( - const Space& space, const Kokkos::View& src, - std::enable_if_t::is_same_memspace>* = - nullptr) { - return Kokkos::create_mirror(space, src); +auto create_mirror_view(const Space&, const Kokkos::View& src) { + return Impl::create_mirror_view(src, + view_alloc(typename Space::memory_space())); } +// public interface that accepts a space and a without initializing flag template ::value>> -typename Impl::MirrorViewType::view_type create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { +auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, Space const&, + Kokkos::View const& src) { return Impl::create_mirror_view( - v, view_alloc(typename Space::memory_space{}, wi)); + src, view_alloc(typename Space::memory_space{}, wi)); } -template +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>>> auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& v) { - return Impl::create_mirror_view(v, arg_prop); + const Kokkos::View& src) { + return Impl::create_mirror_view(src, arg_prop); } -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +// collection of static asserts 
for create_mirror_view_and_copy +template +void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -3708,52 +3726,53 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::View& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - 
std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorViewType::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } // Previously when using auto here, the intel compiler 19.3 would diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 5f251eeb26a..b8d7f77deb3 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -40,7 +40,12 @@ struct ParallelReduceTag {}; struct ChunkSize { int value; + explicit ChunkSize(int value_) : value(value_) {} +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT("ChunkSize should be constructed explicitly.") 
ChunkSize(int value_) : value(value_) {} +#endif }; /** \brief Execution policy for work over a range of an integral type. @@ -714,6 +719,58 @@ class TeamPolicy } }; +// Execution space not provided deduces to TeamPolicy<> + +TeamPolicy()->TeamPolicy<>; + +TeamPolicy(int, int)->TeamPolicy<>; +TeamPolicy(int, int, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; + +// DefaultExecutionSpace deduces to TeamPolicy<> + +TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, + Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; + +// ES != DefaultExecutionSpace deduces to TeamPolicy + +template >> +TeamPolicy(ES const&, int, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) + ->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; + namespace Impl { template @@ -968,9 +1025,9 @@ struct TeamThreadMDRange, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::NotParVector; static constexpr Iterate direction = - OuterDir == Iterate::Default - ? 
layout_iterate_type_selector::outer_iteration_pattern - : iter; + OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template KOKKOS_FUNCTION TeamThreadMDRange(TeamHandleType const& team_, Args&&... args) @@ -983,7 +1040,7 @@ struct TeamThreadMDRange, TeamHandle> { }; template -TeamThreadMDRange(TeamHandle const&, Args&&...) +KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) ->TeamThreadMDRange, TeamHandle>; template @@ -1004,9 +1061,9 @@ struct ThreadVectorMDRange, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector; static constexpr Iterate direction = - OuterDir == Iterate::Default - ? layout_iterate_type_selector::outer_iteration_pattern - : iter; + OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template KOKKOS_INLINE_FUNCTION ThreadVectorMDRange(TeamHandleType const& team_, @@ -1020,7 +1077,7 @@ struct ThreadVectorMDRange, TeamHandle> { }; template -ThreadVectorMDRange(TeamHandle const&, Args&&...) +KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) ->ThreadVectorMDRange, TeamHandle>; template @@ -1041,9 +1098,9 @@ struct TeamVectorMDRange, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector; static constexpr Iterate direction = - iter == Iterate::Default - ? layout_iterate_type_selector::outer_iteration_pattern - : iter; + iter == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template KOKKOS_INLINE_FUNCTION TeamVectorMDRange(TeamHandleType const& team_, @@ -1057,7 +1114,7 @@ struct TeamVectorMDRange, TeamHandle> { }; template -TeamVectorMDRange(TeamHandle const&, Args&&...) +KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) 
->TeamVectorMDRange, TeamHandle>; template #include #include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#else +#include +#endif namespace Kokkos { -namespace Experimental { -constexpr ptrdiff_t dynamic_extent = -1; +#ifndef KOKKOS_ENABLE_IMPL_MDSPAN +constexpr size_t dynamic_extent = std::numeric_limits::max(); +#endif -template +namespace Experimental { + +template struct Extents { /* TODO @enhancement flesh this out more */ }; -template +template struct PrependExtent; -template +template struct PrependExtent, NewExtent> { using type = Extents; }; -template +template struct AppendExtent; -template +template struct AppendExtent, NewExtent> { using type = Extents; }; - } // end namespace Experimental namespace Impl { @@ -75,33 +82,32 @@ struct _parse_impl { // We have to treat the case of int**[x] specially, since it *doesn't* go // backwards -template +template struct _parse_impl, std::enable_if_t<_all_remaining_extents_dynamic::value>> - : _parse_impl> { -}; + : _parse_impl> {}; // int*(*[x])[y] should still work also (meaning int[][x][][y]) -template +template struct _parse_impl< T*, Kokkos::Experimental::Extents, std::enable_if_t::value>> { using _next = Kokkos::Experimental::AppendExtent< typename _parse_impl, void>::type, - Kokkos::Experimental::dynamic_extent>; + Kokkos::dynamic_extent>; using type = typename _next::type; }; -template +template struct _parse_impl, void> - : _parse_impl< - T, Kokkos::Experimental::Extents // TODO @pedantic this - // could be a - // narrowing cast - > {}; + : _parse_impl // TODO @pedantic + // this could be a + // narrowing cast + > {}; } // end namespace _parse_view_extents_impl @@ -111,38 +117,34 @@ struct ParseViewExtents { DataType, Kokkos::Experimental::Extents<>>::type; }; -template +template struct ApplyExtent { using type = ValueType[Ext]; }; template -struct ApplyExtent { +struct ApplyExtent { using type = ValueType*; }; -template +template struct ApplyExtent { using type = typename ApplyExtent::type[N]; }; -template 
+template struct ApplyExtent { using type = ValueType * [Ext]; }; template -struct ApplyExtent { - using type = - typename ApplyExtent::type*; +struct ApplyExtent { + using type = typename ApplyExtent::type*; }; template -struct ApplyExtent { - using type = - typename ApplyExtent::type[N]; +struct ApplyExtent { + using type = typename ApplyExtent::type[N]; }; } // end namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_Graph.hpp b/lib/kokkos/core/src/Kokkos_Graph.hpp index 643bdcc02cc..9cc6650e26e 100644 --- a/lib/kokkos/core/src/Kokkos_Graph.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph.hpp @@ -167,6 +167,9 @@ Graph create_graph(Closure&& arg_closure) { #include #endif #endif +#ifdef SYCL_EXT_ONEAPI_GRAPH +#include +#endif #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index a1fb0f5a677..8b5f29f95b2 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -113,7 +113,6 @@ class HostSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -124,7 +123,6 @@ class HostSpace { const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - public: /**\brief Return Name of the MemorySpace */ static constexpr const char* name() { return m_name; } diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index ca4d956784c..37b80e54a85 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -217,81 +217,12 @@ enum class Iterate { Right // Right indices stride fastest }; -// To check for LayoutTiled -// This is to hide extra compile-time 'identifier' info within the 
LayoutTiled -// class by not relying on template specialization to include the ArgN*'s -template -struct is_layouttiled : std::false_type {}; - -template -struct is_layouttiled> - : std::true_type {}; - -namespace Experimental { - -/// LayoutTiled -// Must have Rank >= 2 -template < - Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, - unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0, - bool IsPowerOfTwo = - (Kokkos::Impl::is_integral_power_of_two(ArgN0) && - Kokkos::Impl::is_integral_power_of_two(ArgN1) && - (Kokkos::Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0)))> -struct LayoutTiled { - static_assert(IsPowerOfTwo, - "LayoutTiled must be given power-of-two tile dimensions"); - - using array_layout = LayoutTiled; - static constexpr Iterate outer_pattern = OuterP; - static constexpr Iterate inner_pattern = InnerP; - - enum { N0 = ArgN0 }; - enum { N1 = ArgN1 }; - enum { N2 = ArgN2 }; - enum { N3 = ArgN3 }; - enum { N4 = ArgN4 }; - enum { N5 = ArgN5 }; - enum { N6 = ArgN6 }; - enum { N7 = ArgN7 }; - - size_t dimension[ARRAY_LAYOUT_MAX_RANK]; - - enum : bool { is_extent_constructible = true }; - - LayoutTiled(LayoutTiled const&) = default; - LayoutTiled(LayoutTiled&&) = default; - LayoutTiled& operator=(LayoutTiled const&) = default; - LayoutTiled& operator=(LayoutTiled&&) = default; - - KOKKOS_INLINE_FUNCTION - explicit constexpr LayoutTiled(size_t argN0 = 0, size_t argN1 = 0, - size_t argN2 = 0, size_t argN3 = 0, - size_t argN4 = 0, size_t argN5 = 0, - size_t argN6 = 0, size_t argN7 = 0) - : dimension{argN0, argN1, 
argN2, argN3, argN4, argN5, argN6, argN7} {} - - friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) { - for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) - if (left.dimension[rank] != right.dimension[rank]) return false; - return true; - } - - friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) { - return !(left == right); - } -}; - -} // namespace Experimental +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +struct KOKKOS_DEPRECATED is_layouttiled : std::false_type {}; +#endif +namespace Impl { // For use with view_copy template struct layout_iterate_type_selector { @@ -320,42 +251,13 @@ struct layout_iterate_type_selector { static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default; }; +} // namespace Impl -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +using layout_iterate_type_selector KOKKOS_DEPRECATED = + Impl::layout_iterate_type_selector; +#endif } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index b255d2a5195..0a0acd303f5 100644 --- 
a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -55,9 +55,22 @@ #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H #include +#include #include #endif +#if !defined(KOKKOS_ENABLE_CXX17) +#if __has_include() +#include +#else +#include +#endif +#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 10 +#error \ + "Compiling with support for C++20 or later requires a libstdc++ version later than 9" +#endif +#endif + //---------------------------------------------------------------------------- /** Pick up compiler specific #define macros: * @@ -332,6 +345,10 @@ #define KOKKOS_DEFAULTED_FUNCTION #endif +#if !defined(KOKKOS_DEDUCTION_GUIDE) +#define KOKKOS_DEDUCTION_GUIDE +#endif + #if !defined(KOKKOS_IMPL_HOST_FUNCTION) #define KOKKOS_IMPL_HOST_FUNCTION #endif @@ -562,8 +579,44 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__NVCOMPILER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("diag_suppress 1216") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("diag_default 1216") +#elif defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + _Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define 
KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] +#ifndef KOKKOS_ENABLE_CXX17 +#define KOKKOS_IMPL_ATTRIBUTE_UNLIKELY [[unlikely]] +#else +#define KOKKOS_IMPL_ATTRIBUTE_UNLIKELY +#endif + #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \ defined(KOKKOS_COMPILER_NVHPC)) && \ diff --git a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp index 3fead8dd293..19967782e5e 100644 --- a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -277,12 +277,20 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) { #endif } KOKKOS_INLINE_FUNCTION float abs(float x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } KOKKOS_INLINE_FUNCTION double abs(double x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } inline long double abs(long double x) { using std::abs; diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index 9be8d8d7aa1..e569fefc14d 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -413,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. 
// template -struct pair { +struct KOKKOS_DEPRECATED pair { using first_type = T1; using second_type = void; @@ -448,41 +449,48 @@ struct pair { // Specialization of relational operators for Kokkos::pair. // +#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair& lhs, const pair& rhs) { return lhs.first == rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const pair& lhs, const pair& rhs) { return !(lhs == rhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair& lhs, const pair& rhs) { return !(rhs < lhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair& lhs, const pair& rhs) { return rhs < lhs; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } +#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +#endif namespace Impl { template diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index 484f6c0d5f4..122239df790 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -137,9 +137,9 @@ inline void parallel_for(const std::string& str, const ExecPolicy& 
policy, ExecPolicy inner_policy = policy; Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelFor closure(functor, inner_policy); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelFor>(functor, inner_policy); closure.execute(); @@ -352,10 +352,10 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, ExecutionPolicy inner_policy = policy; Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelScan closure(functor, - inner_policy); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScan>(functor, + inner_policy); closure.execute(); @@ -398,18 +398,19 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); if constexpr (Kokkos::is_view::value) { - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelScanWithTotal - closure(functor, inner_policy, return_value); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScanWithTotal>( + functor, inner_policy, return_value); closure.execute(); } else { - Kokkos::Impl::shared_allocation_tracking_disable(); Kokkos::View view(&return_value); - Impl::ParallelScanWithTotal - closure(functor, inner_policy, view); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScanWithTotal>(functor, inner_policy, + view); closure.execute(); } diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp 
b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index d499eba6dcc..53913266f13 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -72,7 +72,7 @@ struct Sum { }; template -Sum(View const&) +KOKKOS_DEDUCTION_GUIDE Sum(View const&) ->Sum::memory_space>; template @@ -117,7 +117,7 @@ struct Prod { }; template -Prod(View const&) +KOKKOS_DEDUCTION_GUIDE Prod(View const&) ->Prod::memory_space>; template @@ -164,7 +164,7 @@ struct Min { }; template -Min(View const&) +KOKKOS_DEDUCTION_GUIDE Min(View const&) ->Min::memory_space>; template @@ -212,7 +212,7 @@ struct Max { }; template -Max(View const&) +KOKKOS_DEDUCTION_GUIDE Max(View const&) ->Max::memory_space>; template @@ -258,7 +258,7 @@ struct LAnd { }; template -LAnd(View const&) +KOKKOS_DEDUCTION_GUIDE LAnd(View const&) ->LAnd::memory_space>; template @@ -305,7 +305,7 @@ struct LOr { }; template -LOr(View const&) +KOKKOS_DEDUCTION_GUIDE LOr(View const&) ->LOr::memory_space>; template @@ -352,7 +352,7 @@ struct BAnd { }; template -BAnd(View const&) +KOKKOS_DEDUCTION_GUIDE BAnd(View const&) ->BAnd::memory_space>; template @@ -399,7 +399,7 @@ struct BOr { }; template -BOr(View const&) +KOKKOS_DEDUCTION_GUIDE BOr(View const&) ->BOr::memory_space>; template @@ -458,7 +458,8 @@ struct MinLoc { }; template -MinLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MinLoc( + View, Properties...> const&) ->MinLoc, Properties...>::memory_space>; @@ -513,7 +514,8 @@ struct MaxLoc { }; template -MaxLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MaxLoc( + View, Properties...> const&) ->MaxLoc, Properties...>::memory_space>; @@ -577,7 +579,7 @@ struct MinMax { }; template -MinMax(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) ->MinMax, Properties...>::memory_space>; @@ -646,7 +648,8 @@ struct MinMaxLoc { }; template -MinMaxLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MinMaxLoc( + View, Properties...> 
const&) ->MinMaxLoc, Properties...>::memory_space>; @@ -713,7 +716,8 @@ struct MaxFirstLoc { }; template -MaxFirstLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( + View, Properties...> const&) ->MaxFirstLoc, Properties...>::memory_space>; @@ -782,7 +786,7 @@ struct MaxFirstLocCustomComparator { template -MaxFirstLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) ->MaxFirstLocCustomComparator, @@ -846,7 +850,8 @@ struct MinFirstLoc { }; template -MinFirstLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MinFirstLoc( + View, Properties...> const&) ->MinFirstLoc, Properties...>::memory_space>; @@ -915,7 +920,7 @@ struct MinFirstLocCustomComparator { template -MinFirstLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) ->MinFirstLocCustomComparator, @@ -990,7 +995,8 @@ struct MinMaxFirstLastLoc { }; template -MinMaxFirstLastLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( + View, Properties...> const&) ->MinMaxFirstLastLoc, Properties...>::memory_space>; @@ -1069,7 +1075,7 @@ struct MinMaxFirstLastLocCustomComparator { template -MinMaxFirstLastLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) ->MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, @@ -1133,7 +1139,8 @@ struct FirstLoc { }; template -FirstLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE FirstLoc( + View, Properties...> const&) ->FirstLoc, Properties...>::memory_space>; @@ -1194,7 +1201,7 @@ struct LastLoc { }; template -LastLoc(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) ->LastLoc, Properties...>::memory_space>; @@ -1261,7 +1268,8 @@ struct StdIsPartitioned { }; template -StdIsPartitioned(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( + View, 
Properties...> const&) ->StdIsPartitioned, Properties...>::memory_space>; @@ -1323,7 +1331,8 @@ struct StdPartitionPoint { }; template -StdPartitionPoint(View, Properties...> const&) +KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( + View, Properties...> const&) ->StdPartitionPoint, Properties...>::memory_space>; @@ -1502,18 +1511,18 @@ struct ParallelReduceAdaptor { using Analysis = FunctorAnalysis; - Kokkos::Impl::shared_allocation_tracking_disable(); - CombinedFunctorReducer functor_reducer( - functor, typename Analysis::Reducer( - ReducerSelector::select(functor, return_value))); - - // FIXME Remove "Wrapper" once all backends implement the new interface - Impl::ParallelReduce::execution_space> - closure(functor_reducer, inner_policy, - return_value_adapter::return_value(return_value, functor)); - Kokkos::Impl::shared_allocation_tracking_enable(); + + using CombinedFunctorReducerType = + CombinedFunctorReducer; + auto closure = construct_with_shared_allocation_tracking_disabled< + Impl::ParallelReduce::execution_space>>( + CombinedFunctorReducerType( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))), + inner_policy, + return_value_adapter::return_value(return_value, functor)); closure.execute(); Kokkos::Tools::Impl::end_parallel_reduce( diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 484a0e6f62e..820a40a5f55 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -38,6 +38,8 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include +#include +#include #endif #include @@ -372,6 +374,35 @@ struct ViewTraits { //------------------------------------ }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. 
+template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename Impl::LayoutFromArrayLayout::type; + using accessor_type = Impl::SpaceAwareAccessor< + typename Traits::memory_space, + Kokkos::default_accessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + /** \class View * \brief View to an array of data. * @@ -522,7 +553,6 @@ constexpr bool is_assignable(const Kokkos::View& dst, //---------------------------------------------------------------------------- #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -923,57 +953,30 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && is_layout_left && (rank_dynamic == 0)), + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), reference_type> operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && is_layout_left && (rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && is_layout_right && (rank_dynamic == 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - 
check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && is_layout_right && (rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which @@ -1066,57 +1069,30 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && is_layout_left && (rank_dynamic == 0)), - reference_type> - 
access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && is_layout_left && (rank_dynamic != 0)), + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), reference_type> access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && is_layout_right && (rank_dynamic == 0)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && is_layout_right && (rank_dynamic != 0)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
- return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } //------------------------------ @@ -1442,8 +1418,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; @@ -1495,8 +1470,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; @@ -1725,6 +1699,79 @@ class View : public ViewTraits { "Layout is not constructible from extent arguments. 
Use " "overload taking a layout object instead."); } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN }; template @@ -1878,23 +1925,6 @@ KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, namespace Kokkos { namespace Impl { -inline void shared_allocation_tracking_disable() { - 
Kokkos::Impl::SharedAllocationRecord::tracking_disable(); -} - -inline void shared_allocation_tracking_enable() { - Kokkos::Impl::SharedAllocationRecord::tracking_enable(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - template struct CommonViewValueType; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index acc0dcd3c6e..c8a5d28ba83 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -67,16 +67,7 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); if (!ptr) { - size_t alignment = 1; // OpenACC does not handle alignment - using Kokkos::Experimental::RawMemoryAllocationFailure; - auto failure_mode = - arg_alloc_size > 0 - ? 
RawMemoryAllocationFailure::FailureMode::OutOfMemoryError - : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; - auto alloc_mechanism = - RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; - throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, - alloc_mechanism); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 4fce680aef0..2b98018e3bb 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -44,10 +44,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg =; + auto const a_functor(m_functor); #pragma acc parallel loop gang vector num_gangs(league_size) \ - vector_length(team_size* vector_length) copyin(a_functor) + vector_length(team_size* vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size * team_size * vector_length; i++) { int league_id = i / (team_size * vector_length); typename Policy::member_type team(league_id, league_size, team_size, @@ -145,10 +147,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg =; + auto const a_functor(m_functor); #pragma acc parallel loop gang num_gangs(league_size) num_workers(team_size) \ - vector_length(vector_length) copyin(a_functor) + vector_length(vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size; i++) { int league_id = i; typename Policy::member_type team(league_id, league_size, team_size, diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp index 
81f2c5c3056..82199d0d72d 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -72,9 +72,28 @@ int OpenMP::concurrency(OpenMP const &instance) { int OpenMP::concurrency() const { return impl_thread_pool_size(); } #endif +void OpenMP::impl_static_fence(std::string const &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + []() { + std::lock_guard lock_all_instances( + Impl::OpenMPInternal::all_instances_mutex); + for (auto *instance_ptr : Impl::OpenMPInternal::all_instances) { + std::lock_guard lock_instance( + instance_ptr->m_instance_mutex); + } + }); +} + void OpenMP::fence(const std::string &name) const { Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {}); + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + [this]() { + auto *internal_instance = this->impl_internal_space_instance(); + std::lock_guard lock(internal_instance->m_instance_mutex); + }); } bool OpenMP::impl_is_initialized() noexcept { diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index 11292af84ad..a403909f677 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -67,7 +67,15 @@ class OpenMP { OpenMP(); - OpenMP(int pool_size); + explicit OpenMP(int pool_size); + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "OpenMP execution space should be constructed explicitly.") + OpenMP(int pool_size) + : OpenMP(pool_size) {} +#endif /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; @@ -146,14 +154,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline void OpenMP::impl_static_fence(std::string const& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - []() {}); -} - inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { return false; } diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 32172fbc6c7..0f4c7d60524 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -34,18 +34,8 @@ namespace Kokkos { namespace Impl { -void OpenMPInternal::acquire_lock() { - while (1 == desul::atomic_compare_exchange(&m_pool_mutex, 0, 1, - desul::MemoryOrderAcquire(), - desul::MemoryScopeDevice())) { - // do nothing - } -} - -void OpenMPInternal::release_lock() { - desul::atomic_store(&m_pool_mutex, 0, desul::MemoryOrderRelease(), - desul::MemoryScopeDevice()); -} +std::vector OpenMPInternal::all_instances; +std::mutex OpenMPInternal::all_instances_mutex; void OpenMPInternal::clear_thread_data() { const size_t member_bytes = @@ -123,17 +113,11 @@ void OpenMPInternal::resize_thread_data(size_t pool_reduce_bytes, if (nullptr != m_pool[rank]) { m_pool[rank]->disband_pool(); - space.deallocate(m_pool[rank], old_alloc_bytes); + // impl_deallocate to not fence here + space.impl_deallocate("[unlabeled]", m_pool[rank], old_alloc_bytes); } - void *ptr = nullptr; - try { - ptr = space.allocate(alloc_bytes); - } catch ( - Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - // For now, just rethrow the error message the existing way - Kokkos::Impl::throw_runtime_exception(failure.get_error_message()); - } + void *ptr = 
space.allocate("Kokkos::OpenMP::scratch_mem", alloc_bytes); m_pool[rank] = new (ptr) HostThreadTeamData(); @@ -304,6 +288,18 @@ void OpenMPInternal::finalize() { } m_initialized = false; + + // guard erasing from all_instances + { + std::scoped_lock lock(all_instances_mutex); + + auto it = std::find(all_instances.begin(), all_instances.end(), this); + if (it == all_instances.end()) + Kokkos::abort( + "Execution space instance to be removed couldn't be found!"); + *it = all_instances.back(); + all_instances.pop_back(); + } } void OpenMPInternal::print_configuration(std::ostream &s) const { diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 35b9aa93ba7..f4a0d3e2012 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -56,7 +56,13 @@ struct OpenMPTraits { class OpenMPInternal { private: OpenMPInternal(int arg_pool_size) - : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() {} + : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() { + // guard pushing to all_instances + { + std::scoped_lock lock(all_instances_mutex); + all_instances.push_back(this); + } + } ~OpenMPInternal() { clear_thread_data(); } @@ -66,7 +72,6 @@ class OpenMPInternal { int m_pool_size; int m_level; - int m_pool_mutex = 0; HostThreadTeamData* m_pool[OpenMPTraits::MAX_THREAD_COUNT]; @@ -83,12 +88,6 @@ class OpenMPInternal { int thread_pool_size() const { return m_pool_size; } - // Acquire lock used to protect access to m_pool - void acquire_lock(); - - // Release lock used to protect access to m_pool - void release_lock(); - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -107,6 +106,11 @@ class OpenMPInternal { bool verify_is_initialized(const char* const label) const; void print_configuration(std::ostream& s) const; + + std::mutex 
m_instance_mutex; + + static std::vector all_instances; + static std::mutex all_instances_mutex; }; inline bool execute_in_serial(OpenMP const& space = OpenMP()) { @@ -157,7 +161,7 @@ inline std::vector create_OpenMP_instances( "Kokkos::abort: Partition not enough resources left to create the last " "instance."); } - instances[weights.size() - 1] = resources_left; + instances[weights.size() - 1] = OpenMP(resources_left); return instances; } diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 823a7e668e5..79d7d295c0e 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -108,6 +108,8 @@ class ParallelFor, Kokkos::OpenMP> { public: inline void execute() const { + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); if (execute_in_serial( { exec_range(m_functor, m_policy.begin(), m_policy.end()); return; @@ -202,6 +204,9 @@ class ParallelFor, public: inline void execute() const { + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); + #ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial( { exec_range(0, m_iter.m_rp.m_num_tiles); @@ -333,7 +338,8 @@ class ParallelFor, const size_t team_shared_size = m_shmem_size; const size_t thread_local_size = 0; // Never shrinks - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -343,8 +349,6 @@ class ParallelFor, m_functor, *(m_instance->get_thread_data()), 0, m_policy.league_size(), m_policy.league_size()); - m_instance->release_lock(); - return; } @@ -383,8 +387,6 @@ class ParallelFor, data.disband_team(); } - - m_instance->release_lock(); } inline 
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 05fd1c9dce3..d22e1e7eda0 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -83,7 +83,8 @@ class ParallelReduce, const size_t pool_reduce_bytes = reducer.value_size(); - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -106,6 +107,7 @@ class ParallelReduce, update);; + return; } const int pool_size = m_instance->thread_pool_size(); @@ -157,8 +159,6 @@ class ParallelReduce, m_result_ptr[j] = ptr[j]; } } - - m_instance->release_lock(); } //---------------------------------------- @@ -218,7 +218,8 @@ class ParallelReduceacquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -241,8 +242,6 @@ class ParallelReducerelease_lock(); - return; } #endif @@ -299,8 +298,6 @@ class ParallelReducerelease_lock(); } //---------------------------------------- @@ -415,7 +412,8 @@ class ParallelReduceacquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -433,8 +431,6 @@ class ParallelReducerelease_lock(); - return; } @@ -510,8 +506,6 @@ class ParallelReducerelease_lock(); } //---------------------------------------- diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp index f843aef3a84..b9ce25d3ee5 100644 --- 
a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp @@ -70,6 +70,9 @@ class ParallelScan, const int value_count = Analysis::value_count(m_functor); const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); + m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , 0 // team_shared_bytes @@ -193,7 +196,8 @@ class ParallelScanWithTotal, const int value_count = Analysis::value_count(m_functor); const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -213,8 +217,6 @@ class ParallelScanWithTotal, *m_result_ptr = update; - m_instance->release_lock(); - return; } @@ -266,8 +268,6 @@ class ParallelScanWithTotal, *m_result_ptr = update_base; } } - - m_instance->release_lock(); } //---------------------------------------- diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp index 3e67d8d6252..54c1574d71d 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -52,18 +52,7 @@ HostThreadTeamDataSingleton::HostThreadTeamDataSingleton() num_pool_reduce_bytes, num_team_reduce_bytes, num_team_shared_bytes, num_thread_local_bytes); - void* ptr = nullptr; - try { - ptr = space.allocate(alloc_bytes); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& f) { - // For now, just rethrow the error message with a note - // Note that this could, in turn, trigger an out of memory exception, - // but it's pretty unlikely, so we won't worry about it for now. 
- // TODO reasonable error message when `std::string` causes OOM error - Kokkos::Impl::throw_runtime_exception( - std::string("Failure to allocate scratch memory: ") + - f.get_error_message()); - } + void* ptr = space.allocate("Kokkos::Impl::HostThreadTeamData", alloc_bytes); HostThreadTeamData::scratch_assign( ptr, alloc_bytes, num_pool_reduce_bytes, num_team_reduce_bytes, diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 01b66948654..2877d940faf 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -73,7 +73,8 @@ class TaskQueueSpecialization> { execution_space().impl_internal_space_instance(); const int pool_size = get_max_team_count(scheduler.get_execution_space()); - instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(instance->m_instance_mutex); // TODO @tasking @new_feature DSH allow team sizes other than 1 const int team_size = 1; // Threads per core @@ -152,8 +153,6 @@ class TaskQueueSpecialization> { } self.disband_team(); } // end pragma omp parallel - - instance->release_lock(); } static uint32_t get_max_team_count(execution_space const& espace) { @@ -238,7 +237,8 @@ class TaskQueueSpecializationConstrained< execution_space().impl_internal_space_instance(); const int pool_size = instance->thread_pool_size(); - instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard lock(instance->m_instance_mutex); const int team_size = 1; // Threads per core instance->resize_thread_data(0 /* global reduce buffer */ @@ -250,6 +250,7 @@ class TaskQueueSpecializationConstrained< 0 /* thread local buffer */ ); assert(pool_size % team_size == 0); + auto& queue = scheduler.queue(); queue.initialize_team_queues(pool_size / team_size); @@ -343,8 +344,6 @@ class TaskQueueSpecializationConstrained< } self.disband_team(); } // end pragma 
omp parallel - - instance->release_lock(); } template diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index ea4e7f6baba..84c7b85f11d 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -146,7 +146,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include -#include +#include +#include #include /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index a414b34d7c6..635b0e0504f 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -54,9 +54,11 @@ void* OpenMPTargetSpace::impl_allocate( static_assert(sizeof(void*) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); - void* ptr; + void* ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); - ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); + if (!ptr) { + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); + } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index b39f5aca353..6c5eb048e34 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -71,8 +71,6 @@ void OpenMPTargetExec::verify_initialized(const char* const label) { void* OpenMPTargetExec::m_scratch_ptr = nullptr; int64_t OpenMPTargetExec::m_scratch_size = 0; -int* OpenMPTargetExec::m_lock_array = nullptr; -uint64_t 
OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; @@ -84,15 +82,6 @@ void OpenMPTargetExec::clear_scratch() { m_scratch_size = 0; } -void OpenMPTargetExec::clear_lock_array() { - if (m_lock_array != nullptr) { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_lock_array, m_lock_size); - m_lock_array = nullptr; - m_lock_size = 0; - } -} - void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, @@ -135,35 +124,6 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, } } -int* OpenMPTargetExec::get_lock_array(int num_teams) { - Kokkos::Experimental::OpenMPTargetSpace space; - int max_active_league_size = MAX_ACTIVE_THREADS / 32; - int lock_array_elem = - (num_teams > max_active_league_size) ? num_teams : max_active_league_size; - if (m_lock_size < (lock_array_elem * sizeof(int))) { - space.deallocate(m_lock_array, m_lock_size); - m_lock_size = lock_array_elem * sizeof(int); - m_lock_array = static_cast(space.allocate(m_lock_size)); - - // FIXME_OPENMPTARGET - Creating a target region here to initialize the - // lock_array with 0's fails. Hence creating an equivalent host array to - // achieve the same. Value of host array are then copied to the lock_array. 
- int* h_lock_array = static_cast( - omp_target_alloc(m_lock_size, omp_get_initial_device())); - - for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0; - - if (0 < m_lock_size) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - m_lock_array, h_lock_array, m_lock_size, 0, 0, - omp_get_default_device(), omp_get_initial_device())); - - omp_target_free(h_lock_array, omp_get_initial_device()); - } - - return m_lock_array; -} - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 3387108da39..44e9119ea88 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -106,7 +106,6 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; Kokkos::Impl::OpenMPTargetExec space; - if (space.m_lock_array != nullptr) space.clear_lock_array(); if (space.m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b..e353676b617 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant; +using OpenMPTargetIterateRight = + std::integral_constant; + template struct ThreadAndVectorNestLevel +#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + 
Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + + typename Policy::point_type unused; + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile( + unused, functor, policy, + std::integral_constant()); + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute 
parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; 
++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel 
for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template 
+ inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for 
(auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp similarity index 61% rename from lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp rename to lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 6878531730d..e86a1219749 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -14,128 +14,120 @@ // //@HEADER -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP #include #include -#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" #include -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. Arrays of int might be fine though ... 
-#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; + using ParReduceCopy = ParallelReduceCopy; + + bool m_result_ptr_on_device; + public: inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction == Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % 
policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile(offset, functor, policy); - } - } - } -#endif + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr, + std::integral_constant()); } - template + template + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(, + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess::accessible) {} + + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to \ + : functor) \ + reduction(custom \ + : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ +reduction(+:result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } -#endif + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index 
begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -144,107 +136,119 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ +reduction(+:result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } -#endif + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; + const Index begin_2 = policy.m_lower[3]; + const Index begin_3 = policy.m_lower[2]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; const Index end_3 = policy.m_upper[3]; -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ +reduction(+:result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } -#endif + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -257,64 +261,65 @@ class ParallelFor, const Index end_3 = policy.m_upper[3]; const Index end_4 = policy.m_upper[4]; -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + ValueType result = ValueType(); + + // 
FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? 
end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ +reduction(+:result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } -#endif + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -329,140 +334,69 @@ class ParallelFor, const Index end_4 = policy.m_upper[4]; const Index end_5 = policy.m_upper[5]; -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication 
for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); + functor(i0, i1, i2, i3, i4, i5, result); else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); } } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? 
end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ +reduction(+:result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); + } } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_memory_lock; + } + } + } + } + } - public: - inline void execute() const { - execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(, - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} - template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -509,9 +443,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -567,9 +501,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[3]; @@ -630,9 +564,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& 
policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -701,9 +635,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -788,5 +722,4 @@ reduction(+:result) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index caa568a8925..4a112ed11d0 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,13 +55,13 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. @@ -108,8 +108,7 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_num_elems(arg_result_view.size()) {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 8abffa47a43..16c0eedb818 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,12 +470,11 @@ class ParallelReduce m_scratch_memory_lock; - public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -521,8 +520,7 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())) {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 30195d96e09..b0d69328024 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -143,7 +143,7 @@ class ParallelScan, local_offset_value = element_values(team_id, i - 1); // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ - !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) + !defined(KOKKOS_ARCH_AMD_GFX1100) if constexpr (Analysis::Reducer::has_join_member_function()) { if constexpr (std::is_void_v) a_functor_reducer.get_functor().join(local_offset_value, @@ -177,6 +177,10 @@ class ParallelScan, const idx_type chunk_size = 128; const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View @@ -225,6 +229,10 @@ class ParallelScanWithTotal, const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; if (N > 0) { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 9a246f7642f..4de6931918e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -110,6 +110,31 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_ONEAPI_GRAPH + os << "macro SYCL_EXT_ONEAPI_GRAPH : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_GRAPH : undefined\n"; +#endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } int counter = 0; int active_device = Kokkos::device_id(); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp new file mode 100644 index 00000000000..9c39df94159 --- /dev/null +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -0,0 +1,157 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPHNODEKERNEL_HPP +#define KOKKOS_SYCL_GRAPHNODEKERNEL_HPP + +#include + +#include + +#include +#include +#include + +#include + +namespace Kokkos { +namespace Impl { + +template +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag< + PatternTag, Functor, PolicyType, Args..., + Kokkos::Experimental::SYCL>::type { + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + using base_t = typename PatternImplSpecializationFromTag< + PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + + // TODO use the name and executionspace + template + GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, + Functor arg_functor, PolicyDeduced&& arg_policy, + ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{} + + template + GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, + Functor arg_functor, PolicyDeduced&& arg_policy) + : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), + (PolicyDeduced &&) arg_policy) {} + + void set_sycl_graph_ptr( + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>* + arg_graph) { + m_graph_ptr = arg_graph; + } + + void set_sycl_graph_node_ptr( + std::optional* arg_node) { + m_graph_node_ptr = arg_node; + } + + std::optional& get_sycl_graph_node() + const { + return *m_graph_node_ptr; + } + + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>& + get_sycl_graph() const { + return *m_graph_ptr; + } + + private: + Kokkos::ObservingRawPtr> + m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr> + m_graph_node_ptr = nullptr; +}; + +struct SYCLGraphNodeAggregateKernel { + using graph_kernel = SYCLGraphNodeAggregateKernel; + + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kernels, + struct Policy { + using is_graph_kernel = std::true_type; + }; +}; + +template ::type> +struct get_graph_node_kernel_type + : type_identity> {}; + +template +struct get_graph_node_kernel_type + : type_identity, + Kokkos::ParallelReduceTag>> {}; + +template +auto& get_sycl_graph_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + auto& graph = kernel_as_graph_kernel.get_sycl_graph(); + + return graph; +} + +template +auto& get_sycl_graph_node_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + auto& graph_node = kernel_as_graph_kernel.get_sycl_graph_node(); + + return graph_node; +} + +template +void 
sycl_attach_kernel_to_node(Kernel& kernel, const Lambda& lambda) { + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>& graph = + Impl::get_sycl_graph_from_kernel(kernel); + std::optional& graph_node = + Impl::get_sycl_graph_node_from_kernel(kernel); + KOKKOS_ENSURES(!graph_node); + graph_node = graph.add(lambda); + KOKKOS_ENSURES(graph_node); + // FIXME_SYCL_GRAPH not yet implemented in the compiler + // KOKKOS_ENSURES(graph_node.get_type() == + // sycl::ext::oneapi::experimental::node_type::kernel) +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp new file mode 100644 index 00000000000..6bbe6711a2e --- /dev/null +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPHNODE_IMPL_HPP +#define KOKKOS_SYCL_GRAPHNODE_IMPL_HPP + +#include + +#include + +#include + +#include + +namespace Kokkos { +namespace Impl { +template <> +struct GraphNodeBackendSpecificDetails { + std::optional node; + + explicit GraphNodeBackendSpecificDetails() = default; + + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} +}; + +template +struct GraphNodeBackendDetailsBeforeTypeErasure { + protected: + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} + + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp new file mode 100644 index 00000000000..1dc4a9c9973 --- /dev/null +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -0,0 +1,174 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPH_IMPL_HPP +#define KOKKOS_SYCL_GRAPH_IMPL_HPP + +#include + +#include + +#include +#include + +#include + +#include + +namespace Kokkos { +namespace Impl { +template <> +class GraphImpl { + public: + using node_details_t = + GraphNodeBackendSpecificDetails; + using root_node_impl_t = GraphNodeImpl; + using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl; + + // Not movable or copyable; it spends its whole life as a shared_ptr in the + // Graph object. + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + + ~GraphImpl(); + + explicit GraphImpl(Kokkos::Experimental::SYCL instance); + + void add_node(std::shared_ptr const& arg_node_ptr); + + template + void add_node(std::shared_ptr const& arg_node_ptr); + + template + void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); + + void submit(); + + Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + + auto create_root_node_ptr(); + + template + auto create_aggregate_ptr(PredecessorRefs&&...); + + private: + void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + + Kokkos::Experimental::SYCL m_execution_space; + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable> + m_graph; + std::optional> + m_graph_exec; +}; + +inline GraphImpl::~GraphImpl() { + m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); +} + +inline GraphImpl::GraphImpl( + Kokkos::Experimental::SYCL instance) + : m_execution_space(std::move(instance)), + m_graph(m_execution_space.sycl_queue().get_context(), + m_execution_space.sycl_queue().get_device()) {} + +inline void GraphImpl::add_node( + std::shared_ptr const& arg_node_ptr) { 
+ // add an empty node that needs to be set up before finalizing the graph + arg_node_ptr->node_details_t::node = m_graph.add(); +} + +// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl +// Also requires that the kernel has the graph node tag in its policy +template +inline void GraphImpl::add_node( + std::shared_ptr const& arg_node_ptr) { + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); + KOKKOS_EXPECTS(arg_node_ptr); + // The Kernel launch from the execute() method has been shimmed to insert + // the node into the graph + auto& kernel = arg_node_ptr->get_kernel(); + auto& node = static_cast(arg_node_ptr.get())->node; + KOKKOS_EXPECTS(!node); + kernel.set_sycl_graph_ptr(&m_graph); + kernel.set_sycl_graph_node_ptr(&node); + kernel.execute(); + KOKKOS_ENSURES(node); +} + +// Requires PredecessorRef is a specialization of GraphNodeRef that has +// already been added to this graph and NodeImpl is a specialization of +// GraphNodeImpl that has already been added to this graph. 
+template +inline void GraphImpl::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { + KOKKOS_EXPECTS(arg_node_ptr); + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + KOKKOS_EXPECTS(pred_ptr); + + auto& pred_node = pred_ptr->node_details_t::node; + KOKKOS_EXPECTS(pred_node); + + auto& node = arg_node_ptr->node_details_t::node; + KOKKOS_EXPECTS(node); + + m_graph.make_edge(*pred_node, *node); +} + +inline void GraphImpl::submit() { + if (!m_graph_exec) { + instantiate_graph(); + } + m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); +} + +inline Kokkos::Experimental::SYCL const& +GraphImpl::get_execution_space() const noexcept { + return m_execution_space; +} + +inline auto GraphImpl::create_root_node_ptr() { + KOKKOS_EXPECTS(!m_graph_exec); + auto rv = std::make_shared(get_execution_space(), + _graph_node_is_root_ctor_tag{}); + rv->node_details_t::node = m_graph.add(); + return rv; +} + +template +inline auto GraphImpl::create_aggregate_ptr( + PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. 
+ return std::make_shared(m_execution_space, + _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); +} +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 0e67adb5787..5843dca8123 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,26 +166,27 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr SYCLInternal::resize_team_scratch_space( +Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. - if (m_team_scratch_current_size[scratch_pool_id] == 0) { + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = 
mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -234,8 +235,8 @@ void SYCLInternal::finalize() { for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) { - Kokkos::kokkos_free( - m_team_scratch_ptr[i]); + device_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; } @@ -250,7 +251,8 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); @@ -270,7 +272,8 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } -sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { +Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( + const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); @@ -290,7 +293,8 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { return m_scratchHost; } -sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index ab7e8ce71e0..2d784ef8a5f 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ 
b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,12 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr scratch_space(const std::size_t size); - sycl::device_ptr scratch_flags(const std::size_t size); - sycl::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -59,21 +58,22 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + 
static constexpr int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable Kokkos::Impl::sycl_device_ptr + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7fbf5420f83..cb7b1048da3 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -120,7 +120,7 @@ class Kokkos::Impl::ParallelFor, desul::ensure_sycl_lock_arrays_on_device(q); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); const sycl::range<3> local_range = range.get_local_range(); @@ -153,12 +153,22 @@ class Kokkos::Impl::ParallelFor, {global_x, global_y, global_z}, {local_x, local_y, local_z}) .exec_range(); }); - }); -#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else #endif + { + auto parallel_for_event = q.submit(cgh_lambda); - return parallel_for_event; +#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES + q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); +#endif + return parallel_for_event; + } } public: @@ -181,12 +191,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - 
ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..8ef43d392c6 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -17,11 +17,15 @@ #ifndef KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_ #define KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_ +#ifdef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE +#include +#endif #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES #include #endif namespace Kokkos::Impl { +#ifndef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE template struct FunctorWrapperRangePolicyParallelFor { using WorkTag = typename Policy::work_tag; @@ -37,14 +41,15 @@ struct FunctorWrapperRangePolicyParallelFor { typename Policy::index_type m_begin; FunctorWrapper m_functor_wrapper; }; +#endif // Same as above but for a user-provided workgroup size template struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v) @@ -74,27 +79,47 @@ class Kokkos::Impl::ParallelFor, const Policy m_policy; template - static sycl::event sycl_direct_launch(const Policy& policy, - const Functor& functor, - const sycl::event& memcpy_event) { + sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, + const sycl::event& memcpy_event) const { // Convenience references const Kokkos::Experimental::SYCL& space =; sycl::queue& q = space.sycl_queue(); 
desul::ensure_sycl_lock_arrays_on_device(q); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); #else (void)memcpy_event; #endif + if (policy.chunk_size() <= 1) { +#ifdef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE + const auto actual_range = policy.end() - policy.begin(); + FunctorWrapperRangePolicyParallelForCustom f{ + policy.begin(), functor, actual_range}; + // Round the actual range up to the closest power of two not exceeding + // the maximum workgroup size + const auto max_wgroup_size = + q.get_device().get_info(); + const auto wgroup_size_multiple = Kokkos::bit_floor( + std::min(max_wgroup_size, actual_range)); + + const auto launch_range = (actual_range + wgroup_size_multiple - 1) / + wgroup_size_multiple * wgroup_size_multiple; + sycl::nd_range<1> range( + launch_range, sycl::ext::oneapi::experimental::auto_range<1>()); + cgh.parallel_for< + FunctorWrapperRangePolicyParallelForCustom>(range, + f); +#else FunctorWrapperRangePolicyParallelFor f{policy.begin(), functor}; sycl::range<1> range(policy.end() - policy.begin()); cgh.parallel_for>( range, f); +#endif } else { // Use the chunk size as workgroup size. 
We need to make sure that the // range the kernel is launched with is a multiple of the workgroup @@ -111,12 +136,22 @@ class Kokkos::Impl::ParallelFor, FunctorWrapperRangePolicyParallelForCustom>(range, f); } - }); -#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else #endif + { + auto parallel_for_event = q.submit(cgh_lambda); - return parallel_for_event; +#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES + q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); +#endif + return parallel_for_event; + } } public: @@ -137,12 +172,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index ecb4a863da2..cf7f582bc79 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -22,13 +22,14 @@ #include #include +#include #include template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = ::Kokkos::Experimental::SYCL::size_type; @@ -44,24 +45,19 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor instance at a 
time use the team scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; template - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space =; + const Kokkos::Experimental::SYCL& space =; sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues sycl::local_accessor team_scratch_memory_L0( @@ -72,7 +68,6 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -114,28 +109,53 @@ class Kokkos::Impl::ParallelFor, sycl::range<2>(m_team_size, m_league_size * final_vector_size), sycl::range<2>(m_team_size, final_vector_size)), lambda); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else +#endif + { + auto parallel_for_event = q.submit(cgh_lambda); + #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); + q.ext_oneapi_submit_barrier(std::vector{parallel_for_event}); #endif - return parallel_for_event; + return parallel_for_event; + } } public: inline void execute() const { if (m_league_size == 0) return; - auto& space = *; + auto& instance = *; + + // Only let one 
instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,10 +163,7 @@ class Kokkos::Impl::ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock( - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = @@ -159,22 +176,14 @@ class Kokkos::Impl::ParallelFor, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- auto& space = *; - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const auto& instance = *; + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! " "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index f55280e22e3..0774b24bca1 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -94,10 +92,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl_device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -108,13 +106,13 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -129,12 +127,20 @@ class Kokkos::Impl::ParallelReduce{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise (when n_tiles is not zero), we perform a reduction on the // values in all workgroups separately, write the workgroup results back @@ -155,16 +161,16 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); sycl::local_accessor num_teams_done(1, cgh); @@ -298,12 +304,19 @@ class Kokkos::Impl::ParallelReduce{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr @@ -311,6 +324,11 @@ class Kokkos::Impl::ParallelReduce::execute: result " "not device-accessible"); @@ -330,6 +348,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -349,10 +373,6 @@ class 
Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 5333e3c8a83..2d46ffc77dc 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -50,9 +50,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( ->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -69,10 +67,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl_device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,10 +86,10 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { const auto begin = policy.begin(); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -114,24 +112,32 @@ class Kokkos::Impl::ParallelReduce{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise (when size > 1), we perform a reduction on the values in all // workgroups separately, write the workgroup results back to global // memory and recurse until only one workgroup does the reduction and thus // gets the final value. 
- auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr results_ptr, int values_per_thread) { + sycl_device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -241,7 +247,7 @@ class Kokkos::Impl::ParallelReduce num_teams_done(1, cgh); auto dummy_reduction_lambda = @@ -302,7 +308,7 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( @@ -320,12 +326,20 @@ class Kokkos::Impl::ParallelReduce(n_wgroups * wgroup_size, wgroup_size), reduction_lambda); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr @@ -333,6 +347,11 @@ class Kokkos::Impl::ParallelReduce::execute: result " "not device-accessible"); @@ -347,6 +366,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -366,10 +391,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp 
b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 27165c59e3a..b443bcbf902 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -23,6 +23,7 @@ #include #include +#include #include template @@ -30,7 +31,7 @@ class Kokkos::Impl::ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -54,24 +55,18 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelReduce instance at a time use the team scratch memory - // and the host scratch memory. The constructor acquires the mutex which is - // released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space =; + const Kokkos::Experimental::SYCL& space =; Kokkos::Experimental::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -82,7 +77,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -95,14 +90,14 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues sycl::local_accessor team_scratch_memory_L0( @@ -113,7 +108,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -144,19 +138,26 @@ class Kokkos::Impl::ParallelReduce{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise, (if the total range has more than one element) we perform a // reduction on the values in all workgroups separately, write the // workgroup results back to global memory and recurse until only one // workgroup does the reduction and thus gets the final value. - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - auto scratch_flags = static_cast>( + auto cgh_lambda = [&](sycl::handler& cgh) { + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -170,12 +171,11 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::device_ptr results_ptr) { + sycl_device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -331,7 +331,7 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = @@ -359,12 +359,19 @@ class Kokkos::Impl::ParallelReduce(m_team_size, n_wgroups * m_vector_size), sycl::range<2>(m_team_size, m_vector_size)), reduction_lambda); - }); + }; +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr @@ -372,6 +379,11 @@ class Kokkos::Impl::ParallelReduce::execute: result not " "device-accessible"); @@ -386,6 +398,22 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -395,14 +423,24 @@ class Kokkos::Impl::ParallelReduce + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(, + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = m_policy.team_size_recommended( @@ -423,22 +461,15 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const Kokkos::Experimental::Impl::SYCLInternal& instance = + *; + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -448,25 +479,6 @@ class Kokkos::Impl::ParallelReduce requested too large team size."); } - - public: - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(, - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock( - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 977b69bc9eb..bdb5b883770 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -18,6 +18,7 @@ #define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include +#include #include #include @@ -35,20 +36,38 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, auto sg = item.get_sub_group(); const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; - - for (int stride = 1; stride < global_range; stride <<= 1) { - auto tmp = sg.shuffle_up(local_value, stride); + const int local_range = std::min(sg.get_local_range()[0], global_range); + +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, + stride); + if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + 
shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else + for (int stride = 1; stride < local_range; stride <<= 1) { + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } +#endif const int max_subgroup_size = sg.get_max_local_range()[0]; const int n_active_subgroups = (global_range + max_subgroup_size - 1) / max_subgroup_size; - const int local_range = sg.get_local_range()[0]; if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups) local_mem[sg_group_id] = local_value; - local_value = sg.shuffle_up(local_value, 1); + local_value = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, 1); if (id_in_sg == 0) final_reducer.init(&local_value); sycl::group_barrier(item.get_group()); @@ -61,8 +80,29 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const auto upper_bound = std::min(local_range, n_active_subgroups - round * local_range); auto local_sg_value = local_mem[idx < n_active_subgroups ? 
idx : 0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < upper_bound) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_sg_value, stride); + if (id_in_sg >= stride) { + if (idx < n_active_subgroups) + final_reducer.join(&local_sg_value, &tmp); + else + local_sg_value = tmp; + } + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(upper_bound <= 32); +#else for (int stride = 1; stride < upper_bound; stride <<= 1) { - auto tmp = sg.shuffle_up(local_sg_value, stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_sg_value, stride); if (id_in_sg >= stride) { if (idx < n_active_subgroups) final_reducer.join(&local_sg_value, &tmp); @@ -70,6 +110,7 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, local_sg_value = tmp; } } +#endif if (idx < n_active_subgroups) { local_mem[idx] = local_sg_value; if (round > 0) @@ -111,14 +152,10 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::host_ptr m_scratch_host = nullptr; + sycl_host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the host scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, @@ -131,95 +168,93 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, - sycl::device_ptr global_mem_, - sycl::device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor local_mem, + sycl::local_accessor + num_teams_done, + sycl_device_ptr global_mem_, + sycl_device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + 
const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - 
item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr global_mem; - sycl::device_ptr group_results; + sycl_device_ptr global_mem; + sycl_device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -254,9 +289,9 @@ class ParallelScanSYCLBase { // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass global_mem = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -334,6 +369,11 @@ class ParallelScanSYCLBase { auto& instance = *; + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock scratch_buffers_lock( + instance.m_mutexScratchSpace); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -352,10 +392,7 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_buffers_lock( - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 9cc8008cdf3..19fad29150e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -56,6 +56,23 @@ void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +namespace { + +std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { + switch (allocation_kind) { + case sycl::usm::alloc::host: + return Kokkos::Experimental::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: + return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: + return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + default: + Kokkos::abort("bug: unknown sycl allocation type"); + return "unreachable"; + } +} + +} // namespace namespace Kokkos { namespace Experimental { @@ -75,17 +92,17 @@ SYCLHostUSMSpace::SYCLHostUSMSpace() SYCLHostUSMSpace::SYCLHostUSMSpace(sycl::queue queue) : m_queue(std::move(queue)) {} -void* allocate_sycl( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - const 
RawMemoryAllocationFailure::AllocationMechanism failure_tag, - const sycl::usm::alloc allocation_kind, const sycl::queue& queue) { +void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle, + const sycl::usm::alloc allocation_kind, + const sycl::queue& queue) { void* const hostPtr = sycl::malloc(arg_alloc_size, queue, allocation_kind); - if (hostPtr == nullptr) - throw RawMemoryAllocationFailure( - arg_alloc_size, 1, RawMemoryAllocationFailure::FailureMode::Unknown, - failure_tag); + if (hostPtr == nullptr) { + Kokkos::Impl::throw_bad_alloc(get_memory_space_name(allocation_kind), + arg_alloc_size, arg_label); + } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -106,12 +123,10 @@ void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice, - sycl::usm::alloc::device, - *exec_space.impl_internal_space_instance()->m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::device, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -121,11 +136,9 @@ void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLDeviceUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice, - sycl::usm::alloc::device, m_queue); + return allocate_sycl(arg_label, 
arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::device, m_queue); } void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space, @@ -136,12 +149,10 @@ void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared, - sycl::usm::alloc::shared, - *exec_space.impl_internal_space_instance()->m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::shared, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -150,11 +161,9 @@ void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLSharedUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared, - sycl::usm::alloc::shared, m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::shared, m_queue); } void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, @@ -164,12 +173,10 @@ void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost, - sycl::usm::alloc::host, - 
*exec_space.impl_internal_space_instance()->m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::host, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -178,11 +185,9 @@ void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLHostUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost, - sycl::usm::alloc::host, m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::host, m_queue); } void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index dbba3827581..1e42faa5a83 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -22,6 +22,7 @@ #ifdef KOKKOS_ENABLE_SYCL #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -133,72 +134,71 @@ class SYCLTeamMember { const unsigned int team_rank_ = team_rank(); // First combine the values in the same subgroup +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int shift) { + if (vector_range * shift < sub_group_range) { + const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( + sg, value, vector_range * shift); + if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + 
shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(sub_group_range <= 32); +#else for (unsigned int shift = 1; vector_range * shift < sub_group_range; shift <<= 1) { - const value_type tmp = sg.shuffle_down(value, vector_range * shift); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( + sg, value, vector_range * shift); if (team_rank_ + shift < team_size_) reducer.join(value, tmp); } - value = sg.shuffle(value, 0); +#endif + value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); - const auto n_subgroups = sg.get_group_range()[0]; + const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { reducer.reference() = value; return; } - // We need to chunk up the whole reduction because we might not have - // allocated enough memory. - const unsigned int maximum_work_range = - std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); + // It was found experimentally that 16 is a good value for Intel PVC. + // Since there is a maximum number of 1024 threads with subgroup size 16, + // we have a maximum of 64 subgroups per workgroup which means 64/16=4 + // rounds for loading values into the reduction_array, and 16 redundant + // reduction steps executed by every thread. + constexpr int step_width = 16; + auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite< + value_type[step_width]>(m_item.get_group()); + auto& reduction_array = *tmp_alloc; const auto id_in_sg = sg.get_local_id()[0]; - auto reduction_array = - static_cast>(m_team_reduce); - // Load values into the first maximum_work_range values of the reduction + // Load values into the first step_width values of the reduction // array in chunks. This means that only sub groups with an id in the // corresponding chunk load values. 
- const auto group_id = sg.get_group_id()[0]; - if (id_in_sg == 0 && group_id < maximum_work_range) + const int group_id = sg.get_group_id()[0]; + if (id_in_sg == 0 && group_id < step_width) reduction_array[group_id] = value; sycl::group_barrier(m_item.get_group()); - for (unsigned int start = maximum_work_range; start < n_subgroups; - start += maximum_work_range) { + for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && - group_id < - std::min(start + maximum_work_range, n_subgroups)) + group_id < std::min(start + step_width, n_subgroups)) reducer.join(reduction_array[group_id - start], value); sycl::group_barrier(m_item.get_group()); } - // Let the first subgroup do the final reduction - if (group_id == 0) { - const auto local_range = sg.get_local_range()[0]; - auto result = - reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0]; - // In case the maximum_work_range is larger than the range of the first - // subgroup, we first combine the items with a higher index. - for (unsigned int offset = local_range; offset < maximum_work_range; - offset += local_range) - if (id_in_sg + offset < maximum_work_range) - reducer.join(result, reduction_array[id_in_sg + offset]); - sycl::group_barrier(sg); - - // Now do the actual subgroup reduction. - const auto min_range = - std::min(maximum_work_range, local_range); - for (unsigned int stride = 1; stride < min_range; stride <<= 1) { - const auto tmp = sg.shuffle_down(result, stride); - if (id_in_sg + stride < min_range) reducer.join(result, tmp); - } - if (id_in_sg == 0) reduction_array[0] = result; - } - sycl::group_barrier(m_item.get_group()); + // Do the final reduction for all threads redundantly + value = reduction_array[0]; + for (int i = 1; i < std::min(step_width, n_subgroups); ++i) + reducer.join(value, reduction_array[i]); - reducer.reference() = reduction_array[0]; - // Make sure that the reduction array hasn't been modified in the meantime. 
- m_item.barrier(sycl::access::fence_space::local_space); + reducer.reference() = value; + // Make sure that every thread is done using the reduction array. + sycl::group_barrier(m_item.get_group()); } //-------------------------------------------------------------------------- @@ -223,7 +223,8 @@ class SYCLTeamMember { // First combine the values in the same subgroup for (unsigned int stride = 1; vector_range * stride < sub_group_range; stride <<= 1) { - auto tmp = sg.shuffle_up(value, vector_range * stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, value, vector_range * stride); if (id_in_sg >= vector_range * stride) value += tmp; } @@ -249,7 +250,8 @@ class SYCLTeamMember { sub_group_range, n_active_subgroups - round * sub_group_range); auto local_value = base_data[idx]; for (unsigned int stride = 1; stride < upper_bound; stride <<= 1) { - auto tmp = sg.shuffle_up(local_value, stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_value, stride); if (id_in_sg >= stride) { if (idx < n_active_subgroups) local_value += tmp; @@ -267,7 +269,8 @@ class SYCLTeamMember { } auto total = base_data[n_active_subgroups - 1]; - const auto update = sg.shuffle_up(value, vector_range); + const auto update = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + (id_in_sg >= vector_range ? update : 0); @@ -320,7 +323,7 @@ class SYCLTeamMember { typename ReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { - tmp2 = sg.shuffle_down(tmp, i); + tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { reducer.join(tmp, tmp2); } @@ -331,8 +334,9 @@ class SYCLTeamMember { // because floating point summation is not associative // and thus different threads could have different results. 
- tmp2 = sg.shuffle(tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; + tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( + sg, tmp, (sg.get_local_id() / grange1) * grange1); + value = tmp2; reducer.reference() = tmp2; } @@ -342,7 +346,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr scratch_level_1_ptr, + sycl_device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) @@ -839,7 +843,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< // [t] += [t-4] if t >= 4 // ... for (int j = 1; j < static_cast(grange1); j <<= 1) { - value_type tmp = sg.shuffle_up(val, j); + value_type tmp = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, val, j); if (j <= static_cast(tidx1)) { reducer.join(val, tmp); } @@ -850,7 +855,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< // Update i's contribution into the val and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - accum = sg.shuffle(val, mask + vector_offset); + accum = Kokkos::Impl::SYCLReduction::select_from_group( + sg, val, mask + vector_offset); } reducer.reference() = accum; } @@ -927,7 +933,8 @@ KOKKOS_INLINE_FUNCTION void single( const auto grange1 = item.get_local_range(1); const auto sg = item.get_sub_group(); if (item.get_local_id(1) == 0) lambda(val); - val = sg.shuffle(val, (sg.get_local_id() / grange1) * grange1); + val = Kokkos::Impl::SYCLReduction::select_from_group( + sg, val, (sg.get_local_id() / grange1) * grange1); } template diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index c308384af09..abf0bd8f53e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ 
b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,8 +21,53 @@ namespace Kokkos::Impl::SYCLReduction { -// FIXME_SYCL It appears that using shuffles is slower than going through local -// memory. +template +struct TrivialWrapper { + std::byte array[N]; +}; + +// shuffle down +template +T shift_group_left(sycl::sub_group sg, T x, + sycl::sub_group::linear_id_type delta) { + if constexpr (std::is_trivially_copyable_v) + return sycl::shift_group_left(sg, x, delta); + else { + auto tmp = sycl::shift_group_left( + sg, reinterpret_cast&>(x), delta); + return reinterpret_cast(tmp); + } +} + +// shuffle up +template +T shift_group_right(sycl::sub_group sg, T x, + sycl::sub_group::linear_id_type delta) { + if constexpr (std::is_trivially_copyable_v) + return sycl::shift_group_right(sg, x, delta); + else { + auto tmp = sycl::shift_group_right( + sg, reinterpret_cast&>(x), delta); + return reinterpret_cast(tmp); + } +} + +// shuffle +template +T select_from_group(sycl::sub_group sg, T x, + sycl::sub_group::id_type remote_local_id) { + if constexpr (std::is_trivially_copyable_v) + return sycl::select_from_group(sg, x, remote_local_id); + else { + auto tmp = sycl::select_from_group( + sg, reinterpret_cast&>(x), remote_local_id); + return reinterpret_cast(tmp); + } +} + +// FIXME_SYCL For some types, shuffle reductions are competitive with local +// memory reductions but they are significantly slower for the value type used +// in combined reductions with multiple double arguments. 
template inline constexpr bool use_shuffle_based_algorithm = false; // std::is_reference_v; @@ -30,7 +75,7 @@ inline constexpr bool use_shuffle_based_algorithm = false; template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::device_ptr results_ptr, + sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -102,24 +147,40 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::device_ptr results_ptr, + ValueType local_value, sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); // Perform the actual workgroup reduction in each subgroup // separately. - auto sg = item.get_sub_group(); - const int id_in_sg = sg.get_local_id()[0]; - const auto local_range = - std::min(sg.get_local_range()[0], max_size); + auto sg = item.get_sub_group(); + const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min(sg.get_local_range()[0], max_size); const auto upper_stride_bound = - std::min(local_range - id_in_sg, max_size - local_id); + std::min(local_range - id_in_sg, max_size - local_id); +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left(sg, local_value, + stride); + if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { - auto tmp 
= sg.shuffle_down(local_value, stride); + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, local_value, stride); if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); } +#endif // Copy the subgroup results into the first positions of the // reduction array. @@ -140,7 +201,7 @@ std::enable_if_t> workgroup_reduction( // the first subgroup, we first combine the items with a higher // index. if (n_active_subgroups > local_range) { - for (unsigned int offset = local_range; offset < n_active_subgroups; + for (int offset = local_range; offset < n_active_subgroups; offset += local_range) if (id_in_sg + offset < n_active_subgroups) { final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]); @@ -149,11 +210,29 @@ std::enable_if_t> workgroup_reduction( } // Then, we proceed as before. +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < local_range) { + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, sg_value, stride); + if (id_in_sg + stride < n_active_subgroups) + final_reducer.join(&sg_value, &tmp); + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { - auto tmp = sg.shuffle_down(sg_value, stride); + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, sg_value, stride); if (id_in_sg + stride < n_active_subgroups) final_reducer.join(&sg_value, &tmp); } +#endif // Finally, we copy the workgroup results back to global memory // to be used in the next iteration. 
If this is the last diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp index 39b201976b5..44d797f1ccc 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -35,6 +35,9 @@ namespace Kokkos { namespace Impl { +std::vector SerialInternal::all_instances; +std::mutex SerialInternal::all_instances_mutex; + bool SerialInternal::is_initialized() { return m_is_initialized; } void SerialInternal::initialize() { @@ -43,6 +46,12 @@ void SerialInternal::initialize() { Impl::SharedAllocationRecord::tracking_enable(); m_is_initialized = true; + + // guard pushing to all_instances + { + std::scoped_lock lock(all_instances_mutex); + all_instances.push_back(this); + } } void SerialInternal::finalize() { @@ -59,6 +68,17 @@ void SerialInternal::finalize() { } m_is_initialized = false; + + // guard erasing from all_instances + { + std::scoped_lock lock(all_instances_mutex); + auto it = std::find(all_instances.begin(), all_instances.end(), this); + if (it == all_instances.end()) + Kokkos::abort( + "Execution space instance to be removed couldn't be found!"); + std::swap(*it, all_instances.back()); + all_instances.pop_back(); + } } SerialInternal& SerialInternal::singleton() { @@ -97,9 +117,12 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, m_thread_team_data.disband_team(); m_thread_team_data.disband_pool(); - space.deallocate("Kokkos::Serial::scratch_mem", - m_thread_team_data.scratch_buffer(), - m_thread_team_data.scratch_bytes()); + // impl_deallocate doesn't fence which we try to avoid here since that + // interferes with the using the m_instance_mutex for ensuring proper + // kernel enqueuing + space.impl_deallocate("Kokkos::Serial::scratch_mem", + m_thread_team_data.scratch_buffer(), + m_thread_team_data.scratch_bytes()); } if (pool_reduce_bytes < old_pool_reduce) { @@ -119,13 +142,7 @@ void SerialInternal::resize_thread_team_data(size_t 
pool_reduce_bytes, HostThreadTeamData::scratch_size(pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, thread_local_bytes); - void* ptr = nullptr; - try { - ptr = space.allocate("Kokkos::Serial::scratch_mem", alloc_bytes); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - // For now, just rethrow the error message the existing way - Kokkos::Impl::throw_runtime_exception(failure.get_error_message()); - } + void* ptr = space.allocate("Kokkos::Serial::scratch_mem", alloc_bytes); m_thread_team_data.scratch_assign(static_cast(ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes, @@ -147,7 +164,9 @@ Serial::Serial(NewInstance) : m_space_instance(new Impl::SerialInternal, [](Impl::SerialInternal* ptr) { ptr->finalize(); delete ptr; - }) {} + }) { + m_space_instance->initialize(); +} void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index 43eb4992ed7..81d43b31b35 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -60,7 +60,10 @@ class SerialInternal { static SerialInternal& singleton(); - std::mutex m_thread_team_data_mutex; + std::mutex m_instance_mutex; + + static std::vector all_instances; + static std::mutex all_instances_mutex; // Resize thread team data scratch memory void resize_thread_team_data(size_t pool_reduce_bytes, @@ -113,7 +116,15 @@ class Serial { Serial(); - Serial(NewInstance); + explicit Serial(NewInstance); + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template + KOKKOS_DEPRECATED_WITH_COMMENT( + "Serial execution space should be constructed explicitly.") + Serial(NewInstance) + : Serial(NewInstance{}) {} +#endif /// \brief True if and only if this method is being called in a /// thread-parallel function. 
@@ -137,7 +148,14 @@ class Serial { name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, - []() {}); // TODO: correct device ID + []() { + std::lock_guard lock_all_instances( + Impl::SerialInternal::all_instances_mutex); + for (auto* instance_ptr : Impl::SerialInternal::all_instances) { + std::lock_guard lock_instance( + instance_ptr->m_instance_mutex); + } + }); // TODO: correct device ID Kokkos::memory_fence(); } @@ -145,7 +163,10 @@ class Serial { "Kokkos::Serial::fence: Unnamed Instance Fence") const { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - []() {}); // TODO: correct device ID + [this]() { + auto* internal_instance = this->impl_internal_space_instance(); + std::lock_guard lock(internal_instance->m_instance_mutex); + }); // TODO: correct device ID Kokkos::memory_fence(); } diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 67978aa3e9f..34e115eca9b 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -43,7 +43,14 @@ class ParallelFor, } public: - inline void execute() const { this->exec(); } + inline void execute() const { + // Make sure kernels are running sequentially even when using multiple + // threads + auto* internal_instance = +; + std::lock_guard lock(internal_instance->m_instance_mutex); + this->exec(); + } template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -104,9 +111,11 @@ class ParallelReduce lock( - internal_instance->m_thread_team_data_mutex); + + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, 
team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 91b4c567113..80faec9041d 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -49,6 +49,10 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // Make sure kernels are running sequentially even when using multiple + // threads + auto* internal_instance =; + std::lock_guard lock(internal_instance->m_instance_mutex); this->template exec(); } @@ -103,9 +107,11 @@ class ParallelReduce, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance =; - // Need to lock resize_thread_team_data - std::lock_guard lock( - internal_instance->m_thread_team_data_mutex); + + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -187,10 +193,12 @@ class ParallelScan, const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks - // Need to lock resize_thread_team_data auto* internal_instance =; - std::lock_guard lock( - internal_instance->m_thread_team_data_mutex); + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -253,10 +261,12 @@ class ParallelScanWithTotal, const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks - // Need to lock resize_thread_team_data auto* 
internal_instance =; - std::lock_guard lock( - internal_instance->m_thread_team_data_mutex); + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a25b51496ef..a523cc86c97 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -247,9 +247,11 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance =; - // Need to lock resize_thread_team_data - std::lock_guard lock( - internal_instance->m_thread_team_data_mutex); + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -319,9 +321,11 @@ class ParallelReduce lock( - internal_instance->m_thread_team_data_mutex); + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard instance_lock( + internal_instance->m_instance_mutex); + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index fd0f221365b..a3501a437d2 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -188,8 +188,6 @@ class ThreadsExecTeamMember { using type = typename 
if_c::type; - if (m_instance == nullptr) return value; - if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -229,8 +227,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return; - type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution @@ -285,8 +281,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -358,6 +352,7 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp new file mode 100644 index 00000000000..95cb6f619cc --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -0,0 +1,318 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_ALLOC_HPP +#define KOKKOS_VIEW_ALLOC_HPP + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace Kokkos::Impl { + +template +bool is_zero_byte(const T& x) { + constexpr std::byte all_zeroes[sizeof(T)] = {}; + return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; +} + +//---------------------------------------------------------------------------- + +/* + * The construction, assignment to default, and destruction + * are merged into a single functor. + * Primarily to work around an unresolved CUDA back-end bug + * that would lose the destruction cuda device function when + * called from the shared memory tracking destruction. + * Secondarily to have two fewer partial specializations. + */ +template ::value> +struct ViewValueFunctor; + +template +struct ViewValueFunctor { + using ExecSpace = typename DeviceType::execution_space; + + struct DestroyTag {}; + struct ConstructTag {}; + + ExecSpace space; + ValueType* ptr; + size_t n; + std::string name; + bool default_exec_space; + + template + KOKKOS_INLINE_FUNCTION + std::enable_if_t::value> + operator()(ConstructTag const&, const size_t i) const { + new (ptr + i) ValueType(); + } + + KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, + const size_t i) const { + (ptr + i)->~ValueType(); + } + + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; + ViewValueFunctor& operator=(const ViewValueFunctor&) = default; + + ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, + size_t const arg_n, std::string arg_name) + : space(arg_space), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(false) { + functor_instantiate_workaround(); + } + + ViewValueFunctor(ValueType* 
const arg_ptr, size_t const arg_n, + std::string arg_name) + : space(ExecSpace{}), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(true) { + functor_instantiate_workaround(); + } + + template + std::enable_if_t::value && + std::is_trivially_copy_assignable::value> + construct_dispatch() { + ValueType value{}; +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if (Impl::is_zero_byte(value)) { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + } else { +#endif + parallel_for_implementation(); +#ifndef KOKKOS_ARCH_A64FX + } +#endif + } + + template + std::enable_if_t::value && + std::is_trivially_copy_assignable::value)> + construct_dispatch() { + parallel_for_implementation(); + } + + template + void parallel_for_implementation() { + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? 
"Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } + +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } +#endif + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + } + + void construct_shared_allocation() { construct_dispatch(); } + + void destroy_shared_allocation() { +#ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else +#endif + { + parallel_for_implementation(); + } + } + + // This function is to ensure that the functor with DestroyTag is instantiated + // This is a workaround to avoid "cudaErrorInvalidDeviceFunction" error later + // when the function is queried with cudaFuncGetAttributes + void functor_instantiate_workaround() { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) + if (false) { + parallel_for_implementation(); + } +#endif + } +}; + +template +struct ViewValueFunctor { + using ExecSpace = typename DeviceType::execution_space; + using PolicyType = Kokkos::RangePolicy>; + + ExecSpace space; + ValueType* ptr; + size_t n; + std::string name; + bool default_exec_space; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { ptr[i] = ValueType(); } + + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; + ViewValueFunctor& operator=(const ViewValueFunctor&) = default; + + ViewValueFunctor(ExecSpace 
const& arg_space, ValueType* const arg_ptr, + size_t const arg_n, std::string arg_name) + : space(arg_space), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(false) {} + + ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, + std::string arg_name) + : space(ExecSpace{}), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(true) {} + + template + std::enable_if_t::value && + std::is_trivially_copy_assignable::value> + construct_shared_allocation() { + // Shortcut for zero initialization +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + ValueType value{}; + if (Impl::is_zero_byte(value)) { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). 
+ Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + } else { +#endif + parallel_for_implementation(); +#ifndef KOKKOS_ARCH_A64FX + } +#endif + } + + template + std::enable_if_t::value && + std::is_trivially_copy_assignable::value)> + construct_shared_allocation() { + parallel_for_implementation(); + } + + void parallel_for_implementation() { + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } +#endif + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + } + + void destroy_shared_allocation() {} +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp new file mode 100644 index 00000000000..8814cc015ef --- /dev/null +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp @@ -0,0 +1,220 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_MDSPAN_ACCESSOR_HPP +#define KOKKOS_MDSPAN_ACCESSOR_HPP + +#include +#include +#include +#include + +namespace Kokkos { + +// For now use the accessors in Impl namespace, as an +// implementation detail for rebasing View on mdspan +namespace Impl { + +template +struct SpaceAwareAccessor { + // Part of Accessor Requirements + using element_type = typename NestedAccessor::element_type; + using reference = typename NestedAccessor::reference; + using data_handle_type = typename NestedAccessor::data_handle_type; + using offset_policy = + SpaceAwareAccessor; + + // Specific to SpaceAwareAccessor + using memory_space = MemorySpace; + using nested_accessor_type = NestedAccessor; + + static_assert(is_memory_space_v); + + KOKKOS_DEFAULTED_FUNCTION + constexpr SpaceAwareAccessor() = default; + + template < + class OtherMemorySpace, class OtherNestedAccessorType, + std::enable_if_t< + MemorySpaceAccess::assignable && + std::is_constructible_v, + int> = 0> + KOKKOS_FUNCTION constexpr SpaceAwareAccessor( + const SpaceAwareAccessor& + other) noexcept + : nested_acc(other.nested_acc) {} + + KOKKOS_FUNCTION + SpaceAwareAccessor(const NestedAccessor& acc) : nested_acc(acc) {} + + KOKKOS_FUNCTION + explicit operator NestedAccessor() const { return nested_acc; } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const noexcept { + Kokkos::Impl::runtime_check_memory_access_violation( + "Kokkos::SpaceAwareAccessor 
ERROR: attempt to access inaccessible " + "memory space"); + return nested_acc.access(p, i); + } + + KOKKOS_FUNCTION + constexpr typename offset_policy::data_handle_type offset(data_handle_type p, + size_t i) const + noexcept { + return nested_acc.offset(p, i); + } + + // Canonical way for accessing nested accessor see ISO C++ + // [linalg.scaled.scaledaccessor] + KOKKOS_FUNCTION + constexpr const NestedAccessor& nested_accessor() const noexcept { + return nested_acc; + } + + private: +// We either compile with our custom mdspan impl +// in which case we discover inside it whether no_unique_address +// works, or we use C++23 in which case it better be available +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor nested_acc; + template + friend struct SpaceAwareAccessor; +}; + +template +struct SpaceAwareAccessor { + // Part of Accessor Requirements + using element_type = typename NestedAccessor::element_type; + using reference = typename NestedAccessor::reference; + using data_handle_type = typename NestedAccessor::data_handle_type; + + using offset_policy = + SpaceAwareAccessor; + + // Specific to SpaceAwareAccessor + using memory_space = AnonymousSpace; + using nested_accessor_type = NestedAccessor; + + KOKKOS_DEFAULTED_FUNCTION + constexpr SpaceAwareAccessor() = default; + + template , + int> = 0> + KOKKOS_FUNCTION constexpr SpaceAwareAccessor( + const SpaceAwareAccessor& + other) noexcept + : nested_acc(other.nested_acc) {} + + KOKKOS_FUNCTION + SpaceAwareAccessor(const NestedAccessor& acc) : nested_acc(acc) {} + + KOKKOS_FUNCTION + explicit operator NestedAccessor() const { return nested_acc; } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const noexcept { + return nested_acc.access(p, i); + } + + KOKKOS_FUNCTION + constexpr typename offset_policy::data_handle_type offset(data_handle_type p, + size_t i) const + noexcept { + return nested_acc.offset(p, i); + } + + 
// Canonical way for accessing nested accessor see ISO C++ + // [linalg.scaled.scaledaccessor] + KOKKOS_FUNCTION + constexpr const NestedAccessor& nested_accessor() const noexcept { + return nested_acc; + } + + private: +// We either compile with our custom mdspan impl +// in which case we discover inside it whether no_unique_address +// works, or we use C++23 in which case it better be available +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor nested_acc; + template + friend struct SpaceAwareAccessor; +}; + +// Like atomic_accessor_relaxed proposed for ISO C++26 but with +// defaulted memory scope - similar to how desul's AtomicRef has a memory scope +template +struct AtomicAccessorRelaxed { + using element_type = ElementType; + using reference = + desul::AtomicRef; + using data_handle_type = ElementType*; + using offset_policy = AtomicAccessorRelaxed; + + KOKKOS_DEFAULTED_FUNCTION + AtomicAccessorRelaxed() = default; + + // Conversions from non-const to const element type + template >* = nullptr> + KOKKOS_FUNCTION constexpr AtomicAccessorRelaxed( + Kokkos::default_accessor) noexcept {} + + template >* = nullptr> + KOKKOS_FUNCTION constexpr AtomicAccessorRelaxed( + AtomicAccessorRelaxed) noexcept {} + + template >* = nullptr> + KOKKOS_FUNCTION explicit operator default_accessor() const { + return default_accessor{}; + } + + KOKKOS_FUNCTION + reference access(data_handle_type p, size_t i) const noexcept { + return reference(p[i]); + } + + KOKKOS_FUNCTION + data_handle_type offset(data_handle_type p, size_t i) const noexcept { + return p + i; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp index 3846b52d239..29d1e00adfc 100644 --- a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp 
@@ -37,9 +37,6 @@ struct ViewDimension; template struct ViewDataType; -} // namespace Kokkos::Impl - -namespace Kokkos::Experimental::Impl { // A few things to note -- // - mdspan allows for 0-rank extents similarly to View, so we don't need @@ -106,6 +103,20 @@ struct DataTypeFromExtents { // Will cause a compile error if it is malformed (i.e. dynamic after static) using type = typename ::Kokkos::Impl::ViewDataType::type; }; -} // namespace Kokkos::Experimental::Impl + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping_impl( + const VM &view_mapping, std::index_sequence) { + return Extents{view_mapping.extent(Indices)...}; +} + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping( + const VM &view_mapping) { + static_assert(Extents::rank() == VM::Rank); + return extents_from_view_mapping_impl( + view_mapping, std::make_index_sequence{}); +} +} // namespace Kokkos::Impl #endif // KOKKOS_EXPERIMENTAL_MDSPAN_EXTENTS_HPP diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp new file mode 100644 index 00000000000..089628137d7 --- /dev/null +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP +#define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP + +#include "Kokkos_MDSpan_Extents.hpp" +#include + +namespace Kokkos::Impl { + +template +struct LayoutFromArrayLayout; + +template <> +struct LayoutFromArrayLayout { + using type = Kokkos::Experimental::layout_left_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = Kokkos::Experimental::layout_right_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = layout_stride; +}; + +template +KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( + const typename MDSpanType::mapping_type &mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + constexpr auto rank = extents_type::rank(); + const auto &ext = mapping.extents(); + + static_assert(rank <= ARRAY_LAYOUT_MAX_RANK, + "Unsupported rank for mdspan (must be <= 8)"); + + if constexpr (std::is_same_v) { + return Kokkos::LayoutStride{ + rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 0 ? mapping.stride(0) : 0, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? mapping.stride(1) : 0, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? mapping.stride(2) : 0, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? mapping.stride(3) : 0, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? mapping.stride(4) : 0, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? mapping.stride(5) : 0, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? mapping.stride(6) : 0, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? 
mapping.stride(7) : 0, + }; + } else { + // FIXME: Kokkos Layouts don't store stride (it's in the mapping) + // We could conceivably fix this by adding an extra ViewCtorProp for + // an abritrary padding. For now we will check for this. + if constexpr (rank > 1 && + (std::is_same_v> || + std::is_same_v>)) { + [[maybe_unused]] constexpr size_t strided_index = + std::is_same_v< + typename mapping_type::layout_type, + Kokkos::Experimental::layout_left_padded> + ? 1 + : rank - 2; + [[maybe_unused]] constexpr size_t extent_index = + std::is_same_v< + typename mapping_type::layout_type, + Kokkos::Experimental::layout_left_padded> + ? 0 + : rank - 1; + KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); + } + + return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + // std::span is not available in C++17 (our current requirements), + // so we need to use the std::array constructor for layout mappings. 
+ // FIXME When C++20 is available, we can use std::span here instead + std::size_t strides[VM::Rank]; + view_mapping.stride_fill(&strides[0]); + if constexpr (std::is_same_v) { + return mapping_type(Kokkos::mdspan_non_standard, + extents_from_view_mapping(view_mapping), + strides); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[1]); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[VM::Rank - 2]); + } else { + return mapping_type(extents_from_view_mapping(view_mapping)); + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index bd12c5c6a99..d13c90825c5 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -19,6 +19,9 @@ #if defined(KOKKOS_ENABLE_SYCL) #include +#ifdef SYCL_EXT_ONEAPI_GRAPH +#include +#endif #include #include #include diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index c7addbe3376..6f862718bcb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -91,6 +91,7 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); + KOKKOS_IMPL_COMBINE_SETTING(print_configuration); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); KOKKOS_IMPL_COMBINE_SETTING(tools_libs); @@ -610,6 +611,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "no"); #endif + 
declare_configuration_metadata("architecture", "Default Device", typeid(Kokkos::DefaultExecutionSpace).name()); @@ -750,9 +752,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_GFX1100) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1100"); -#elif defined(KOKKOS_ARCH_AMD_GFX1103) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX1103"); #else declare_configuration_metadata("architecture", "GPU architecture", "none"); @@ -788,34 +787,18 @@ void initialize_internal(const Kokkos::InitializationSettings& settings) { post_initialize_internal(settings); } -void pre_finalize_internal() { - typename decltype(finalize_hooks)::size_type numSuccessfulCalls = 0; +// declared noexcept such that std::terminate is called if any of the registered +// function throws +void call_registered_finalize_hook_functions() noexcept { while (!finalize_hooks.empty()) { - auto f =; - try { - f(); - } catch (...) { - std::cerr << "Kokkos::finalize: A finalize hook (set via " - "Kokkos::push_finalize_hook) threw an exception that it did " - "not catch." - " Per std::atexit rules, this results in std::terminate. " - "This is " - "finalize hook number " - << numSuccessfulCalls - << " (1-based indexing) " - "out of " - << finalize_hooks.size() - << " to call. Remember that " - "Kokkos::finalize calls finalize hooks in reverse order " - "from how they " - "were pushed." 
- << std::endl; - std::terminate(); - } + auto const& func =; + func(); finalize_hooks.pop(); - ++numSuccessfulCalls; } +} +void pre_finalize_internal() { + call_registered_finalize_hook_functions(); Kokkos::Profiling::finalize(); } diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 3693dff3d46..05d48549193 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -56,7 +56,7 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { //---------------------------------------------------------------------------- // {{{2 - // Not moveable or copyable; it spends its whole live as a shared_ptr in the + // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object GraphImpl() = default; GraphImpl(GraphImpl const&) = delete; @@ -82,10 +82,7 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { template // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl void add_node(std::shared_ptr const& arg_node_ptr) { - static_assert( - NodeImpl::kernel_type::Policy::is_graph_kernel::value, - "Something has gone horribly wrong, but it's too complicated to " - "explain here. Buy Daisy a coffee and she'll explain it to you."); + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); // Since this is always called before any calls to add_predecessor involving // it, we can treat this node as a sink until we discover otherwise. 
arg_node_ptr->node_details_t::set_kernel(arg_node_ptr->get_kernel()); diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp b/lib/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp similarity index 72% rename from lib/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp rename to lib/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp index 4cf170f5f13..02ab127d5c5 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp @@ -13,15 +13,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_ATOMICS_DESUL_CONFIG_HPP -#define KOKKOS_ATOMICS_DESUL_CONFIG_HPP -#include +#ifndef KOKKOS_DESUL_ATOMICS_CONFIG_HPP +#define KOKKOS_DESUL_ATOMICS_CONFIG_HPP #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) #define DESUL_CUDA_ARCH_IS_PRE_PASCAL @@ -32,4 +26,4 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif -#endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp index de6e83ed1f2..0dcd5d523d3 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp @@ -18,133 +18,54 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include - #include -#include #include +#include #include #include // show_warnings #include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -void throw_runtime_exception(const std::string &msg) { +void Kokkos::Impl::throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } -void log_warning(const std::string &msg) { +void 
Kokkos::Impl::throw_bad_alloc(std::string_view memory_space_name, + std::size_t size, std::string_view label) { + std::stringstream ss; + ss << "Kokkos ERROR: " << memory_space_name + << " memory space failed to allocate " << human_memory_size(size) + << " (label=\"" << label << "\")."; + throw std::runtime_error(ss.str()); +} + +void Kokkos::Impl::log_warning(const std::string &msg) { if (show_warnings()) { std::cerr << msg << std::flush; } } -std::string human_memory_size(size_t arg_bytes) { +std::string Kokkos::Impl::human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; const double M = K * 1024; const double G = M * 1024; + const double T = G * 1024; std::ostringstream out; if (bytes < K) { out << std::setprecision(4) << bytes << " B"; } else if (bytes < M) { bytes /= K; - out << std::setprecision(4) << bytes << " K"; + out << std::setprecision(4) << bytes << " KiB"; } else if (bytes < G) { bytes /= M; - out << std::setprecision(4) << bytes << " M"; - } else { + out << std::setprecision(4) << bytes << " MiB"; + } else if (bytes < T) { bytes /= G; - out << std::setprecision(4) << bytes << " G"; - } - return out.str(); -} - -} // namespace Impl - -void Experimental::RawMemoryAllocationFailure::print_error_message( - std::ostream &o) const { - o << "Allocation of size " - << ::Kokkos::Impl::human_memory_size(m_attempted_size); - o << " failed"; - switch (m_failure_mode) { - case FailureMode::OutOfMemoryError: - o << ", likely due to insufficient memory."; - break; - case FailureMode::AllocationNotAligned: - o << " because the allocation was improperly aligned."; - break; - case FailureMode::InvalidAllocationSize: - o << " because the requested allocation size is not a valid size for the" - " requested allocation mechanism (it's probably too large)."; - break; - // TODO move this to the subclass for Cuda-related things - case FailureMode::MaximumCudaUVMAllocationsExceeded: - o << " because the maximum Cuda UVM allocations was 
exceeded."; - break; - case FailureMode::Unknown: o << " because of an unknown error."; break; - } - o << " (The allocation mechanism was "; - switch (m_mechanism) { - case AllocationMechanism::StdMalloc: o << "standard malloc()."; break; - case AllocationMechanism::CudaMalloc: o << "cudaMalloc()."; break; - case AllocationMechanism::CudaMallocManaged: - o << "cudaMallocManaged()."; - break; - case AllocationMechanism::CudaHostAlloc: o << "cudaHostAlloc()."; break; - case AllocationMechanism::HIPMalloc: o << "hipMalloc()."; break; - case AllocationMechanism::HIPHostMalloc: o << "hipHostMalloc()."; break; - case AllocationMechanism::HIPMallocManaged: - o << "hipMallocManaged()."; - break; - case AllocationMechanism::SYCLMallocDevice: - o << "sycl::malloc_device()."; - break; - case AllocationMechanism::SYCLMallocShared: - o << "sycl::malloc_shared()."; - break; - case AllocationMechanism::SYCLMallocHost: - o << "sycl::malloc_host()."; - break; - default: o << "unsupported."; + out << std::setprecision(4) << bytes << " GiB"; + } else { + bytes /= T; + out << std::setprecision(4) << bytes << " TiB"; } - append_additional_error_information(o); - o << ")" << std::endl; -} - -std::string Experimental::RawMemoryAllocationFailure::get_error_message() - const { - std::ostringstream out; - print_error_message(out); return out.str(); } - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -#ifdef KOKKOS_ENABLE_CUDA -namespace Experimental { - -void CudaRawMemoryAllocationFailure::append_additional_error_information( - std::ostream &o) const { - if (m_error_code != cudaSuccess) { - o << " The Cuda allocation returned the error code \"" - << cudaGetErrorName(m_error_code) << "\"."; - } -} - -} // end namespace Experimental -#endif - -} // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp 
b/lib/kokkos/core/src/impl/Kokkos_Error.hpp index 1058fd98dbf..9a80c7b31b8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp @@ -18,116 +18,19 @@ #define KOKKOS_IMPL_ERROR_HPP #include -#include #include #include #include -namespace Kokkos { -namespace Impl { +namespace Kokkos::Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); - +[[noreturn]] void throw_bad_alloc(std::string_view memory_space_name, + std::size_t size, std::string_view label); void log_warning(const std::string &msg); -std::string human_memory_size(size_t arg_bytes); - -} // namespace Impl +std::string human_memory_size(size_t bytes); -namespace Experimental { +} // namespace Kokkos::Impl -class RawMemoryAllocationFailure : public std::bad_alloc { - public: - enum class FailureMode { - OutOfMemoryError, - AllocationNotAligned, - InvalidAllocationSize, - MaximumCudaUVMAllocationsExceeded, - Unknown - }; - enum class AllocationMechanism { - StdMalloc, -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - PosixMemAlign KOKKOS_DEPRECATED, - PosixMMap KOKKOS_DEPRECATED, - IntelMMAlloc KOKKOS_DEPRECATED, #endif - CudaMalloc, - CudaMallocManaged, - CudaHostAlloc, - HIPMalloc, - HIPHostMalloc, - HIPMallocManaged, - SYCLMallocDevice, - SYCLMallocShared, - SYCLMallocHost, - OpenACCMalloc, - }; - - private: - size_t m_attempted_size; - size_t m_attempted_alignment; - FailureMode m_failure_mode; - AllocationMechanism m_mechanism; - - public: - RawMemoryAllocationFailure( - size_t arg_attempted_size, size_t arg_attempted_alignment, - FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, - AllocationMechanism arg_mechanism = - AllocationMechanism::StdMalloc) noexcept - : m_attempted_size(arg_attempted_size), - m_attempted_alignment(arg_attempted_alignment), - m_failure_mode(arg_failure_mode), - m_mechanism(arg_mechanism) {} - - RawMemoryAllocationFailure() noexcept = delete; - - RawMemoryAllocationFailure(RawMemoryAllocationFailure const &) 
noexcept = - default; - RawMemoryAllocationFailure(RawMemoryAllocationFailure &&) noexcept = default; - - RawMemoryAllocationFailure &operator =( - RawMemoryAllocationFailure const &) noexcept = default; - RawMemoryAllocationFailure &operator =( - RawMemoryAllocationFailure &&) noexcept = default; - - ~RawMemoryAllocationFailure() noexcept override = default; - - [[nodiscard]] const char *what() const noexcept override { - if (m_failure_mode == FailureMode::OutOfMemoryError) { - return "Memory allocation error: out of memory"; - } else if (m_failure_mode == FailureMode::AllocationNotAligned) { - return "Memory allocation error: allocation result was under-aligned"; - } - - return nullptr; // unreachable - } - - [[nodiscard]] size_t attempted_size() const noexcept { - return m_attempted_size; - } - - [[nodiscard]] size_t attempted_alignment() const noexcept { - return m_attempted_alignment; - } - - [[nodiscard]] AllocationMechanism allocation_mechanism() const noexcept { - return m_mechanism; - } - - [[nodiscard]] FailureMode failure_mode() const noexcept { - return m_failure_mode; - } - - void print_error_message(std::ostream &o) const; - [[nodiscard]] std::string get_error_message() const; - - virtual void append_additional_error_information(std::ostream &) const {} -}; - -} // end namespace Experimental - -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 1047b773d77..1c1fb67ff04 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -79,22 +79,9 @@ void *HostSpace::impl_allocate( ptr = operator new (arg_alloc_size, std::align_val_t(alignment), std::nothrow_t{}); - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || + if (!ptr || (reinterpret_cast(ptr) == ~uintptr_t(0)) || (reinterpret_cast(ptr) & alignment_mask)) { - 
Experimental::RawMemoryAllocationFailure::FailureMode failure_mode = - Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned; - if (ptr == nullptr) { - failure_mode = Experimental::RawMemoryAllocationFailure::FailureMode:: - OutOfMemoryError; - } - - Experimental::RawMemoryAllocationFailure::AllocationMechanism alloc_mec = - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - StdMalloc; - - throw Kokkos::Experimental::RawMemoryAllocationFailure( - arg_alloc_size, alignment, failure_mode, alloc_mec); + Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -109,9 +96,8 @@ void HostSpace::deallocate(void *const arg_alloc_ptr, void HostSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, - const size_t - - arg_logical_size) const { + const size_t arg_logical_size) const { + if (arg_alloc_ptr) Kokkos::fence("HostSpace::impl_deallocate before free"); impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); } void HostSpace::impl_deallocate( @@ -119,7 +105,6 @@ void HostSpace::impl_deallocate( const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { if (arg_alloc_ptr) { - Kokkos::fence("HostSpace::impl_deallocate before free"); size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; if (Kokkos::Profiling::profileLibraryLoaded()) { diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 25f09b82865..3dc68a187be 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -106,7 +106,11 @@ class HostThreadTeamData { public: inline bool team_rendezvous() const noexcept { - int* ptr = reinterpret_cast(m_team_scratch + m_team_rendezvous); + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_rendezvous != 0: + int* ptr = m_team_scratch == nullptr + ? nullptr + : reinterpret_cast(m_team_scratch + m_team_rendezvous); HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step); if (m_team_rank != 0) { HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step); @@ -130,9 +134,13 @@ class HostThreadTeamData { } inline void team_rendezvous_release() const noexcept { + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_rendezvous != 0: HostBarrier::split_release( - reinterpret_cast(m_team_scratch + m_team_rendezvous), m_team_size, - m_team_rendezvous_step); + (m_team_scratch == nullptr) + ? 
nullptr + : reinterpret_cast(m_team_scratch + m_team_rendezvous), + m_team_size, m_team_rendezvous_step); } inline int pool_rendezvous() const noexcept { @@ -271,6 +279,9 @@ class HostThreadTeamData { } int64_t* team_shared() const noexcept { + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_shared != 0 + if (m_team_scratch == nullptr) return nullptr; return m_team_scratch + m_team_shared; } @@ -400,8 +411,12 @@ class HostThreadTeamMember { int const m_league_size; public: + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_shared != 0: constexpr HostThreadTeamMember(HostThreadTeamData& arg_data) noexcept - : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes()), + : m_scratch(arg_data.team_shared(), (arg_data.team_shared() == nullptr) + ? 0 + : arg_data.team_shared_bytes()), m_data(arg_data), m_league_rank(arg_data.m_league_rank), m_league_size(arg_data.m_league_size) {} diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp index bc6197753c3..0b346530173 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp @@ -971,84 +971,6 @@ void set_callbacks(Kokkos::Tools::Experimental::EventSet new_events) { } // namespace Experimental } // namespace Tools -namespace Profiling { -bool profileLibraryLoaded() { return Kokkos::Tools::profileLibraryLoaded(); } - -void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - Kokkos::Tools::beginParallelFor(kernelPrefix, devID, kernelID); -} -void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - Kokkos::Tools::beginParallelReduce(kernelPrefix, devID, kernelID); -} -void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - 
Kokkos::Tools::beginParallelScan(kernelPrefix, devID, kernelID); -} -void endParallelFor(const uint64_t kernelID) { - Kokkos::Tools::endParallelFor(kernelID); -} -void endParallelReduce(const uint64_t kernelID) { - Kokkos::Tools::endParallelReduce(kernelID); -} -void endParallelScan(const uint64_t kernelID) { - Kokkos::Tools::endParallelScan(kernelID); -} - -void pushRegion(const std::string& kName) { Kokkos::Tools::pushRegion(kName); } -void popRegion() { Kokkos::Tools::popRegion(); } - -void createProfileSection(const std::string& sectionName, uint32_t* secID) { - Kokkos::Tools::createProfileSection(sectionName, secID); -} -void destroyProfileSection(const uint32_t secID) { - Kokkos::Tools::destroyProfileSection(secID); -} - -void startSection(const uint32_t secID) { Kokkos::Tools::startSection(secID); } - -void stopSection(const uint32_t secID) { Kokkos::Tools::stopSection(secID); } - -void markEvent(const std::string& eventName) { - Kokkos::Tools::markEvent(eventName); -} -void allocateData(const SpaceHandle handle, const std::string name, - const void* data, const uint64_t size) { - Kokkos::Tools::allocateData(handle, name, data, size); -} -void deallocateData(const SpaceHandle space, const std::string label, - const void* ptr, const uint64_t size) { - Kokkos::Tools::deallocateData(space, label, ptr, size); -} - -void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, - const void* dst_ptr, const SpaceHandle src_space, - const std::string src_label, const void* src_ptr, - const uint64_t size) { - Kokkos::Tools::beginDeepCopy(dst_space, dst_label, dst_ptr, src_space, - src_label, src_ptr, size); -} -void endDeepCopy() { Kokkos::Tools::endDeepCopy(); } - -void finalize() { Kokkos::Tools::finalize(); } -void initialize(const std::string& profileLibrary) { - Kokkos::Tools::initialize(profileLibrary); -} - -bool printHelp(const std::string& args) { - return Kokkos::Tools::printHelp(args); -} -void parseArgs(const std::string& args) { 
Kokkos::Tools::parseArgs(args); } -void parseArgs(int _argc, char** _argv) { - Kokkos::Tools::parseArgs(_argc, _argv); -} - -SpaceHandle make_space_handle(const char* space_name) { - return Kokkos::Tools::make_space_handle(space_name); -} -} // namespace Profiling - // Tuning namespace Tools { diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp index 025d8d3d18e..01a41d0c3fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp @@ -263,40 +263,41 @@ size_t get_current_context_id(); } // namespace Tools namespace Profiling { -bool profileLibraryLoaded(); +// don't let ClangFormat reorder the using-declarations below +// clang-format off +using Kokkos::Tools::profileLibraryLoaded; -void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void endParallelFor(const uint64_t kernelID); -void endParallelReduce(const uint64_t kernelID); -void endParallelScan(const uint64_t kernelID); -void pushRegion(const std::string& kName); -void popRegion(); +using Kokkos::Tools::printHelp; +using Kokkos::Tools::parseArgs; -void createProfileSection(const std::string& sectionName, uint32_t* secID); -void destroyProfileSection(const uint32_t secID); -void startSection(const uint32_t secID); +using Kokkos::Tools::initialize; +using Kokkos::Tools::finalize; -void stopSection(const uint32_t secID); +using Kokkos::Tools::beginParallelFor; +using Kokkos::Tools::beginParallelReduce; +using Kokkos::Tools::beginParallelScan; +using Kokkos::Tools::endParallelFor; +using Kokkos::Tools::endParallelReduce; +using Kokkos::Tools::endParallelScan; -void markEvent(const std::string& eventName); -void allocateData(const SpaceHandle handle, const 
std::string name, - const void* data, const uint64_t size); -void deallocateData(const SpaceHandle space, const std::string label, - const void* ptr, const uint64_t size); -void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, - const void* dst_ptr, const SpaceHandle src_space, - const std::string src_label, const void* src_ptr, - const uint64_t size); -void endDeepCopy(); -void finalize(); -void initialize(const std::string& = {}); +using Kokkos::Tools::allocateData; +using Kokkos::Tools::deallocateData; + +using Kokkos::Tools::beginDeepCopy; +using Kokkos::Tools::endDeepCopy; + +using Kokkos::Tools::pushRegion; +using Kokkos::Tools::popRegion; + +using Kokkos::Tools::createProfileSection; +using Kokkos::Tools::destroyProfileSection; +using Kokkos::Tools::startSection; +using Kokkos::Tools::stopSection; + +using Kokkos::Tools::markEvent; -SpaceHandle make_space_handle(const char* space_name); +using Kokkos::Tools::make_space_handle; +// clang-format on namespace Experimental { using Kokkos::Tools::Experimental::set_allocate_data_callback; diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h index 15c466b27ed..8c3194e43b5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -32,6 +32,10 @@ // Profiling +#ifdef __cplusplus +extern "C" { +#endif + struct Kokkos_Profiling_KokkosPDeviceInfo { size_t deviceID; }; @@ -267,4 +271,8 @@ struct Kokkos_Profiling_EventSet { // changing struct layout }; +#ifdef __cplusplus +} +#endif + #endif // KOKKOS_PROFILING_C_INTERFACE_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index 0bc3814b3a1..ccf3c47a1ef 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -323,41 +323,6 @@ void 
SharedAllocationRecord::print_host_accessible_records( } #endif -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... 
- } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - void fill_host_accessible_header_info( SharedAllocationRecord* arg_record, SharedAllocationHeader& arg_header, std::string const& arg_label) { diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 99ab660213f..da03cc49830 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -196,36 +196,21 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure); - template SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, std::string const& label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - safe_throw_allocation_with_header_failure(, label, failure); - } - return nullptr; // unreachable + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), alloc_size)); } template SharedAllocationHeader* checked_allocation_with_header( ExecutionSpace const& exec_space, MemorySpace const& space, std::string const& label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - safe_throw_allocation_with_header_failure(, label, failure); - } - return nullptr; // unreachable + return reinterpret_cast( + space.allocate(exec_space, 
label.c_str(), + alloc_size + sizeof(SharedAllocationHeader), alloc_size)); } void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, @@ -385,11 +370,9 @@ SharedAllocationRecord template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ MEMORY_SPACE> -namespace { - /* Taking the address of this function so make sure it is unique */ template -void deallocate(SharedAllocationRecord* record_ptr) { +inline void deallocate(SharedAllocationRecord* record_ptr) { using base_type = SharedAllocationRecord; using this_type = SharedAllocationRecord; @@ -401,8 +384,6 @@ void deallocate(SharedAllocationRecord* record_ptr) { delete ptr; } -} // namespace - /* * Memory space specialization of SharedAllocationRecord< Space , void > * requires : @@ -487,15 +468,21 @@ union SharedAllocationTracker { // pressure on compiler optimization by reducing // number of symbols and inline functions. -#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ - KOKKOS_IF_ON_HOST((if (!(m_record_bits & DO_NOT_DEREF_FLAG)) { \ - Record::increment(m_record); \ - })) +#ifdef KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY +#define KOKKOS_IMPL_BRANCH_PROB KOKKOS_IMPL_ATTRIBUTE_UNLIKELY +#else +#define KOKKOS_IMPL_BRANCH_PROB +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ + KOKKOS_IF_ON_HOST( \ + (if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_BRANCH_PROB { Record::increment(m_record); })) -#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ - KOKKOS_IF_ON_HOST((if (!(m_record_bits & DO_NOT_DEREF_FLAG)) { \ - Record::decrement(m_record); \ - })) +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ + KOKKOS_IF_ON_HOST( \ + (if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_BRANCH_PROB { Record::decrement(m_record); })) #define KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, \ override_tracking) \ @@ -642,8 +629,41 @@ union SharedAllocationTracker { #undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT #undef 
KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT +#undef KOKKOS_IMPL_BRANCH_PROB }; +struct SharedAllocationDisableTrackingGuard { + SharedAllocationDisableTrackingGuard() { + KOKKOS_ASSERT( + (Kokkos::Impl::SharedAllocationRecord::tracking_enabled())); + Kokkos::Impl::SharedAllocationRecord::tracking_disable(); + } + + SharedAllocationDisableTrackingGuard( + const SharedAllocationDisableTrackingGuard&) = delete; + SharedAllocationDisableTrackingGuard(SharedAllocationDisableTrackingGuard&&) = + delete; + + ~SharedAllocationDisableTrackingGuard() { + KOKKOS_ASSERT(( + !Kokkos::Impl::SharedAllocationRecord::tracking_enabled())); + Kokkos::Impl::SharedAllocationRecord::tracking_enable(); + } + // clang-format off + // The old version of clang format we use is particularly egregious here + SharedAllocationDisableTrackingGuard& operator=( + const SharedAllocationDisableTrackingGuard&) = delete; + SharedAllocationDisableTrackingGuard& operator=( + SharedAllocationDisableTrackingGuard&&) = delete; + // clang-format on +}; + +template +inline FunctorType construct_with_shared_allocation_tracking_disabled( + Args&&... args) { + [[maybe_unused]] auto guard = SharedAllocationDisableTrackingGuard{}; + return {std::forward(args)...}; +} } /* namespace Impl */ } /* namespace Kokkos */ #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp deleted file mode 100644 index fe43b630184..00000000000 --- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ /dev/null @@ -1,622 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP -#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP - -#include - -namespace Kokkos { -namespace Impl { - -template -struct ViewDataAnalysis> { - private: - using array_analysis = ViewArrayAnalysis; - - static_assert(std::is_void

::value); - static_assert(std::is_same>::value); - static_assert(std::is_scalar::value, - "View of Array type must be of a scalar type"); - - public: - using specialize = Kokkos::Array<>; - - using dimension = typename array_analysis::dimension; - - private: - enum { - is_const = std::is_same::value - }; - - using array_scalar_dimension = typename dimension::template append::type; - - using scalar_type = std::conditional_t; - using non_const_scalar_type = V; - using const_scalar_type = const V; - - public: - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - using scalar_array_type = - typename ViewDataType::type; - using const_scalar_array_type = - typename ViewDataType::type; - using non_const_scalar_array_type = - typename ViewDataType::type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View mapping for non-specialized data type and standard layout */ -template -class ViewMapping> { - private: - template - friend class ViewMapping; - template - friend class Kokkos::View; - - using offset_type = ViewOffset; - - using handle_type = typename Traits::value_type::pointer; - - handle_type m_impl_handle; - offset_type m_impl_offset; - size_t m_stride = 0; - - using scalar_type = typename Traits::value_type::value_type; - - using contiguous_reference = Kokkos::Array::contiguous>; - using strided_reference = - Kokkos::Array::strided>; - - enum { - is_contiguous_reference = - (Traits::rank == 0) || (std::is_same::value) - }; - - enum { Array_N = 
Traits::value_type::size() }; - enum { Array_S = is_contiguous_reference ? Array_N : 1 }; - - KOKKOS_INLINE_FUNCTION - ViewMapping(const handle_type &arg_handle, const offset_type &arg_offset) - : m_impl_handle(arg_handle), - m_impl_offset(arg_offset), - m_stride(is_contiguous_reference ? 0 : arg_offset.span()) {} - - public: - //---------------------------------------- - // Domain dimensions - - static constexpr unsigned Rank = Traits::dimension::rank; - - template - KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { - return m_impl_offset.m_dim.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - using dim_type = typename offset_type::dimension_type; - return dim_type::static_extent(r); - } - - KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() - const { - return m_impl_offset.layout(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { - return m_impl_offset.dimension_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { - return m_impl_offset.dimension_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { - return m_impl_offset.dimension_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { - return m_impl_offset.dimension_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { - return m_impl_offset.dimension_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { - return m_impl_offset.dimension_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { - return m_impl_offset.dimension_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { - return m_impl_offset.dimension_7(); - } - - // Is a regular layout with uniform striding for each index. 
- using is_regular = typename offset_type::is_regular; - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_impl_offset.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_impl_offset.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_impl_offset.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_impl_offset.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_impl_offset.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_impl_offset.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_impl_offset.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_impl_offset.stride_7(); - } - - //---------------------------------------- - // Range span - - /** \brief Span of the mapped range */ - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return m_impl_offset.span() * Array_N; - } - - /** \brief Is the mapped range span contiguous */ - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_impl_offset.span_is_contiguous(); - } - - using reference_type = - std::conditional_t; - - using pointer_type = handle_type; - - /** \brief If data references are lvalue_reference than can query pointer to - * memory */ - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_impl_handle; - } - - //---------------------------------------- - // The View class performs all rank and bounds checking before - // calling these element reference methods. 
- - KOKKOS_FORCEINLINE_FUNCTION - reference_type reference() const { - return reference_type(m_impl_handle + 0, Array_N, 0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0) const { - return reference_type(m_impl_handle + m_impl_offset(i0) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1, i2) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2, - const I3 &i3, - const I4 &i4) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const 
I6 &i6, const I7 &i7) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7) * Array_S, - Array_N, m_stride); - } - - //---------------------------------------- - - private: - enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ }; - enum { MemorySpanSize = sizeof(scalar_type) }; - - public: - /** \brief Span, in bytes, of the referenced memory */ - KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const { - return (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - } - - //---------------------------------------- - - KOKKOS_DEFAULTED_FUNCTION ViewMapping() = default; - - //---------------------------------------- - - template - KOKKOS_INLINE_FUNCTION ViewMapping(pointer_type ptr, Args... args) - : m_impl_handle(ptr), - m_impl_offset(std::integral_constant(), args...), - m_stride(m_impl_offset.span()) {} - - //---------------------------------------- - - template - Kokkos::Impl::SharedAllocationRecord<> *allocate_shared( - Kokkos::Impl::ViewCtorProp const &arg_prop, - typename Traits::array_layout const &arg_layout, - bool execution_space_specified) { - using alloc_prop = Kokkos::Impl::ViewCtorProp; - - using execution_space = typename alloc_prop::execution_space; - using memory_space = typename Traits::memory_space; - static_assert( - SpaceAccessibility::accessible); - using functor_type = - ViewValueFunctor; - using record_type = - Kokkos::Impl::SharedAllocationRecord; - - // Query the mapping for byte-size of allocation. - using padding = std::integral_constant< - unsigned int, alloc_prop::allow_padding ? 
sizeof(scalar_type) : 0>; - - m_impl_offset = offset_type(padding(), arg_layout); - - const size_t alloc_size = - (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - const auto &alloc_name = Impl::get_property(arg_prop); - const execution_space &exec_space = - Impl::get_property(arg_prop); - const memory_space &mem_space = - Impl::get_property(arg_prop); - - // Allocate memory from the memory space and create tracking record. - record_type *const record = - execution_space_specified - ? record_type::allocate(exec_space, mem_space, alloc_name, - alloc_size) - : record_type::allocate(mem_space, alloc_name, alloc_size); - - m_impl_handle = handle_type(reinterpret_cast(record->data())); - - functor_type functor = - execution_space_specified - ? functor_type(exec_space, (pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name) - : functor_type((pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name); - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - // Make sure the destroy functor gets instantiated. - // This avoids "cudaErrorInvalidDeviceFunction"-type errors. - functor.destroy_shared_allocation(); - } -#endif - - // Only initialize if the allocation is non-zero. - // May be zero if one of the dimensions is zero. - if constexpr (alloc_prop::initialize) - if (alloc_size) { - // Assume destruction is only required when construction is requested. - // The ViewValueFunctor has both value construction and destruction - // operators. 
- record->m_destroy = std::move(functor); - - // Construct values - record->m_destroy.construct_shared_allocation(); - } - - return record; - } -}; - -/** \brief Assign Array to non-Array */ - -template -class ViewMapping< - DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>> { - public: - // Can only convert to View::array_type - - enum { - is_assignable_data_type = - std::is_same::value && - (DstTraits::rank == SrcTraits::rank + 1) - }; - enum { - is_assignable = - std::is_same::value && - std::is_same::value - }; - - using TrackType = Kokkos::Impl::SharedAllocationTracker; - using DstType = ViewMapping; - using SrcType = ViewMapping>; - - KOKKOS_INLINE_FUNCTION - static void assign(DstType &dst, const SrcType &src, - const TrackType & /*src_track*/) { - static_assert(is_assignable, "Can only convert to array_type"); - - using dst_offset_type = typename DstType::offset_type; - - // Array dimension becomes the last dimension. - // Arguments beyond the destination rank are ignored. - if (src.span_is_contiguous()) { // not padded - dst.m_impl_offset = dst_offset_type( - std::integral_constant(), - typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? 
src.dimension_7() - : SrcTraits::value_type::size()))); - } else { // is padded - using padded = std::integral_constant< - unsigned int, sizeof(typename SrcTraits::value_type::value_type)>; - - dst.m_impl_offset = dst_offset_type( - padded(), typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? src.dimension_7() - : SrcTraits::value_type::size()))); - } - - dst.m_impl_handle = src.m_impl_handle; - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ViewMapping< - std::enable_if_t<( - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, - SrcTraits, Args...> { - private: - static_assert(SrcTraits::rank == sizeof...(Args)); - - enum : bool { - R0 = is_integral_extent<0, Args...>::value, - R1 = is_integral_extent<1, Args...>::value, - R2 = is_integral_extent<2, Args...>::value, - R3 = is_integral_extent<3, Args...>::value, - R4 = is_integral_extent<4, Args...>::value, - R5 = is_integral_extent<5, Args...>::value, - R6 = is_integral_extent<6, Args...>::value, - R7 = is_integral_extent<7, Args...>::value - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) - }; - - // Whether right-most rank is a range. - enum { - R0_rev = - 0 == SrcTraits::rank - ? 
false - : (1 == SrcTraits::rank - ? R0 - : (2 == SrcTraits::rank - ? R1 - : (3 == SrcTraits::rank - ? R2 - : (4 == SrcTraits::rank - ? R3 - : (5 == SrcTraits::rank - ? R4 - : (6 == SrcTraits::rank - ? R5 - : (7 == SrcTraits::rank - ? R6 - : R7))))))) - }; - - // Subview's layout - using array_layout = - std::conditional_t<((rank == 0) || - (rank <= 2 && R0 && - std::is_same::value) || - (rank <= 2 && R0_rev && - std::is_same::value)), - typename SrcTraits::array_layout, - Kokkos::LayoutStride>; - - using value_type = typename SrcTraits::value_type; - - using data_type = std::conditional_t< - rank == 0, value_type, - std::conditional_t< - rank == 1, value_type *, - std::conditional_t< - rank == 2, value_type **, - std::conditional_t< - rank == 3, value_type ***, - std::conditional_t< - rank == 4, value_type ****, - std::conditional_t< - rank == 5, value_type *****, - std::conditional_t< - rank == 6, value_type ******, - std::conditional_t>>>>>>>; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION - static void assign(ViewMapping &dst, - ViewMapping const &src, Args... 
args) { - using DstType = ViewMapping; - - using dst_offset_type = typename DstType::offset_type; - using dst_handle_type = typename DstType::handle_type; - - const SubviewExtents extents(src.m_impl_offset.m_dim, - args...); - - dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents); - dst.m_impl_handle = dst_handle_type( - src.m_impl_handle + - src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6), extents.domain_offset(7))); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp deleted file mode 100644 index 957717f973d..00000000000 --- a/lib/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp +++ /dev/null @@ -1,1425 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP -#define KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// View offset and mapping for tiled view's - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout< - Kokkos::Experimental::LayoutTiled> - : public std::true_type {}; - -template -struct is_array_layout_tiled : public std::false_type {}; - -template -struct is_array_layout_tiled> : public std::true_type { -}; // Last template parameter "true" meaning this currently only supports - // powers-of-two - -namespace Impl { - -template -struct ViewOffset< - Dimension, Layout, - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && - is_array_layout::value && - is_array_layout_tiled::value)>> { - public: - static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern; - static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern; - - static constexpr int VORank = Dimension::rank; - - static constexpr unsigned SHIFT_0 = - Kokkos::Impl::integral_power_of_two(Layout::N0); - static constexpr unsigned SHIFT_1 = - Kokkos::Impl::integral_power_of_two(Layout::N1); - static constexpr unsigned SHIFT_2 = - Kokkos::Impl::integral_power_of_two(Layout::N2); - static constexpr unsigned SHIFT_3 = - Kokkos::Impl::integral_power_of_two(Layout::N3); - static constexpr unsigned SHIFT_4 = - 
Kokkos::Impl::integral_power_of_two(Layout::N4); - static constexpr unsigned SHIFT_5 = - Kokkos::Impl::integral_power_of_two(Layout::N5); - static constexpr unsigned SHIFT_6 = - Kokkos::Impl::integral_power_of_two(Layout::N6); - static constexpr unsigned SHIFT_7 = - Kokkos::Impl::integral_power_of_two(Layout::N7); - static constexpr int MASK_0 = Layout::N0 - 1; - static constexpr int MASK_1 = Layout::N1 - 1; - static constexpr int MASK_2 = Layout::N2 - 1; - static constexpr int MASK_3 = Layout::N3 - 1; - static constexpr int MASK_4 = Layout::N4 - 1; - static constexpr int MASK_5 = Layout::N5 - 1; - static constexpr int MASK_6 = Layout::N6 - 1; - static constexpr int MASK_7 = Layout::N7 - 1; - - static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1; - static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2; - static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3; - static constexpr unsigned SHIFT_5T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4; - static constexpr unsigned SHIFT_6T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5; - static constexpr unsigned SHIFT_7T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6; - static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7; - - // Is an irregular layout that does not have uniform striding for each index. 
- using is_mapping_plugin = std::true_type; - using is_regular = std::false_type; - - using size_type = size_t; - using dimension_type = Dimension; - using array_layout = Layout; - - dimension_type m_dim; - size_type m_tile_N0; // Num tiles dim 0 - size_type m_tile_N1; - size_type m_tile_N2; - size_type m_tile_N3; - size_type m_tile_N4; - size_type m_tile_N5; - size_type m_tile_N6; - size_type m_tile_N7; - - //---------------------------------------- - -#define KOKKOS_IMPL_DEBUG_OUTPUT_CHECK 0 - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, - I1 const& i1) const { - auto tile_offset = - (outer_pattern == (Kokkos::Iterate::Left)) - ? (((i0 >> SHIFT_0) + m_tile_N0 * ((i1 >> SHIFT_1))) << SHIFT_2T) - : (((m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1))) << SHIFT_2T); - // ( num_tiles[1] * ti0 + ti1 ) * FTD - - auto local_offset = (inner_pattern == (Kokkos::Iterate::Left)) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0)) - : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); - // ( tile_dim[1] * li0 + li1 ) - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << "locali0 = " << (i0 & MASK_0) - << "\nlocali1 = " << (i1 & MASK_1) << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + m_tile_N1 * (i2 >> SHIFT_2))) - << SHIFT_3T) - : ((m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) - << SHIFT_3T); - - auto local_offset = (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1))) - : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 << " i2 = " << i2 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << " tilei2 = " << (i2 >> SHIFT_2) - << "\nlocali0 = " << (i0 & MASK_0) - << "locali1 = " << (i1 & MASK_1) << "locali2 = " << (i2 & MASK_2) - << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, - I3 const& i3) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * (i3 >> SHIFT_3)))) - << SHIFT_4T) - : ((m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) - << SHIFT_4T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2))) - : (((i0 & MASK_0) << (SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_3)) + (i3 & MASK_3)); - - return tile_offset + local_offset; - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * ((i3 >> SHIFT_3) + - m_tile_N3 * (i4 >> SHIFT_4))))) - << SHIFT_5T) - : ((m_tile_N4 * - (m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) - << SHIFT_5T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3))) - : (((i0 & MASK_0) << (SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_4)) + (i4 & MASK_4)); - - return tile_offset + local_offset; - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, - I5 const& i5) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * ((i4 >> SHIFT_4) + - m_tile_N4 * (i5 >> SHIFT_5)))))) - << SHIFT_6T) - : ((m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) - << SHIFT_6T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4))) - : (((i0 & MASK_0) - << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_5)) + (i5 & MASK_5)); - - return tile_offset + local_offset; - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * (i6 >> SHIFT_6))))))) - << SHIFT_7T) - : ((m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) - << SHIFT_7T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5))) - : (((i0 & MASK_0) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + - SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) - << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_6)) + (i6 & MASK_6)); - - return tile_offset + local_offset; - } - - // Rank 8 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6, - I7 const& i7) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * - ((i6 >> SHIFT_6) + - m_tile_N6 * (i7 >> SHIFT_7)))))))) - << SHIFT_8T) - : ((m_tile_N7 * - (m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * - (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) + - (i7 >> SHIFT_7)) - << SHIFT_8T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5)) + - ((i7 & MASK_7) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6))) - : (((i0 & MASK_0) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) - << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_7 + SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_7 + SHIFT_6)) + - ((i6 & MASK_6) << (SHIFT_7)) + (i7 & MASK_7)); - - return tile_offset + local_offset; - } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { - return array_layout((VORank > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (VORank > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (VORank > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (VORank > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (VORank > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (VORank > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (VORank > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (VORank > 7 ? 
m_dim.N7 : KOKKOS_INVALID_INDEX)); - } - - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { - return m_dim.N0; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { - return m_dim.N1; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { - return m_dim.N2; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { - return m_dim.N3; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { - return m_dim.N4; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { - return m_dim.N5; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { - return m_dim.N6; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { - return m_dim.N7; - } - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { - return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * - m_dim.N6 * m_dim.N7; - } - - // Strides are meaningless due to irregularity - KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0; } - - // Stride with [ rank ] value is the total length - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 0; - if (0 < dimension_type::rank) { - s[1] = 0; - } - if (1 < dimension_type::rank) { - s[2] = 0; - } - if (2 < dimension_type::rank) { - s[3] = 0; - } - if (3 < dimension_type::rank) { - s[4] = 0; - } - if (4 < dimension_type::rank) { - s[5] = 0; - } - if (5 < 
dimension_type::rank) { - s[6] = 0; - } - if (6 < dimension_type::rank) { - s[7] = 0; - } - if (7 < dimension_type::rank) { - s[8] = 0; - } - } - - KOKKOS_INLINE_FUNCTION constexpr size_type span() const { - // Rank2: ( NumTile0 * ( NumTile1 ) ) * TileSize, etc - return (VORank == 2) - ? (m_tile_N0 * m_tile_N1) << SHIFT_2T - : (VORank == 3) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2) << SHIFT_3T - : (VORank == 4) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * m_tile_N3) - << SHIFT_4T - : (VORank == 5) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4) - << SHIFT_5T - : (VORank == 6) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4 * m_tile_N5) - << SHIFT_6T - : (VORank == 7) - ? (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6) - << SHIFT_7T - : (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6 * m_tile_N7) - << SHIFT_8T; - } - - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return true; - } - - //---------------------------------------- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION ~ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset(const ViewOffset&) = default; - KOKKOS_DEFAULTED_FUNCTION 
ViewOffset& operator=(const ViewOffset&) = default; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr ViewOffset( - std::integral_constant const&, - array_layout const arg_layout) - : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], - arg_layout.dimension[2], arg_layout.dimension[3], - arg_layout.dimension[4], arg_layout.dimension[5], - arg_layout.dimension[6], arg_layout.dimension[7]), - m_tile_N0((arg_layout.dimension[0] + MASK_0) >> - SHIFT_0 /* number of tiles in first dimension */), - m_tile_N1((arg_layout.dimension[1] + MASK_1) >> SHIFT_1), - m_tile_N2((VORank > 2) ? (arg_layout.dimension[2] + MASK_2) >> SHIFT_2 - : 0), - m_tile_N3((VORank > 3) ? (arg_layout.dimension[3] + MASK_3) >> SHIFT_3 - : 0), - m_tile_N4((VORank > 4) ? (arg_layout.dimension[4] + MASK_4) >> SHIFT_4 - : 0), - m_tile_N5((VORank > 5) ? (arg_layout.dimension[5] + MASK_5) >> SHIFT_5 - : 0), - m_tile_N6((VORank > 6) ? (arg_layout.dimension[6] + MASK_6) >> SHIFT_6 - : 0), - m_tile_N7((VORank > 7) ? (arg_layout.dimension[7] + MASK_7) >> SHIFT_7 - : 0) {} -}; - -// FIXME Remove the out-of-class definitions when we require C++17 -#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE \ - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \ - is_array_layout::value && \ - is_array_layout_tiled::value)> -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern; -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern; -template -constexpr int - ViewOffset::VORank; -template -constexpr unsigned - ViewOffset::SHIFT_0; -template -constexpr unsigned - ViewOffset::SHIFT_1; -template -constexpr unsigned - ViewOffset::SHIFT_2; -template -constexpr unsigned - ViewOffset::SHIFT_3; -template -constexpr unsigned - ViewOffset::SHIFT_4; -template -constexpr unsigned - ViewOffset::SHIFT_5; -template -constexpr unsigned - ViewOffset::SHIFT_6; -template -constexpr unsigned 
- ViewOffset::SHIFT_7; -template -constexpr int - ViewOffset::MASK_0; -template -constexpr int - ViewOffset::MASK_1; -template -constexpr int - ViewOffset::MASK_2; -template -constexpr int - ViewOffset::MASK_3; -template -constexpr int - ViewOffset::MASK_4; -template -constexpr int - ViewOffset::MASK_5; -template -constexpr int - ViewOffset::MASK_6; -template -constexpr int - ViewOffset::MASK_7; -template -constexpr unsigned - ViewOffset::SHIFT_2T; -template -constexpr unsigned - ViewOffset::SHIFT_3T; -template -constexpr unsigned - ViewOffset::SHIFT_4T; -template -constexpr unsigned - ViewOffset::SHIFT_5T; -template -constexpr unsigned - ViewOffset::SHIFT_6T; -template -constexpr unsigned - ViewOffset::SHIFT_7T; -template -constexpr unsigned - ViewOffset::SHIFT_8T; -#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE - -//---------------------------------------- - -// ViewMapping assign method needed in order to return a 'subview' tile as a -// proper View The outer iteration pattern determines the mapping of the pointer -// offset to the beginning of requested tile The inner iteration pattern is -// needed for the layout of the tile's View to be returned Rank 2 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1) { - using dst_map_type = ViewMapping; - using src_map_type = 
ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left ? ((i_tile0 + src.m_impl_offset.m_tile_N0 * i_tile1) - << src_offset_type::SHIFT_2T) - : ((src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) - << src_offset_type::SHIFT_2T)) // offset to start - // of the tile - ), - dst_offset_type()); - } -}; - -// Rank 3 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * i_tile2)) - << src_offset_type::SHIFT_3T) - : ((src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) + - i_tile2) - << src_offset_type::SHIFT_3T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 4 -template -class ViewMapping< - std::enable_if_t<(N4 == 0 && N5 == 0 && N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T****, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * - (i_tile2 + src.m_impl_offset.m_tile_N2 * - i_tile3))) - << src_offset_type::SHIFT_4T) - : ((src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) - << src_offset_type::SHIFT_4T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 5 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * i_tile4)))) - << src_offset_type::SHIFT_5T) - : ((src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) - << src_offset_type::SHIFT_5T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 6 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + src.m_impl_offset.m_tile_N4 * - i_tile5))))) - << src_offset_type::SHIFT_6T) - : ((src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) - << src_offset_type::SHIFT_6T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 7 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - i_tile6)))))) - << src_offset_type::SHIFT_7T) - : ((src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) - << src_offset_type::SHIFT_7T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 8 -template -class ViewMapping< - std::enable_if_t<(N0 != 0 && N1 != 0 && N2 != 0 && N3 != 0 && N4 != 0 && - N5 != 0 && N6 != 0 && N7 != 0)> // void - , - Kokkos::ViewTraits< - T********, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6, const iType7 i_tile7) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = 
dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - (i_tile6 + - src.m_impl_offset.m_tile_N6 * - i_tile7))))))) - << src_offset_type::SHIFT_8T) - : ((src.m_impl_offset.m_tile_N7 * - (src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) + - i_tile7) - << src_offset_type::SHIFT_8T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------- - -namespace Kokkos { - -// Rank 2 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View(src, SrcLayout(), i_tile0, - i_tile1); -} - -// Rank 3 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2); -} - -// Rank 4 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3); -} - -// Rank 5 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4); -} - -// Rank 6 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5); -} - -// Rank 7 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6); -} - -// Rank 8 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T********, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6, const size_t i_tile7) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6, i_tile7); -} - -} /* namespace Kokkos */ -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIENTAL_VIEWLAYOUTTILE_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 3217c76e380..8919dccdb7a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -17,6 +17,7 @@ #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP #define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -647,34 +649,60 @@ struct ViewOffset< m_dim.N5 * m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // FIXME: The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_dim.N0; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_dim.N0; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements Stride with [ rank ] value is + // the total length + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -935,34 +963,59 @@ struct ViewOffset< m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_stride; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_stride; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + 
// clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1286,42 +1339,58 @@ struct ViewOffset< m_dim.N1; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; n *= m_dim.N1; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = n; } - s[dimension_type::rank] = n * m_dim.N0; + return n * m_dim.N0; + } + // clang-format on + + // 
Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1573,41 +1642,57 @@ struct ViewOffset< return m_stride; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride; } - s[dimension_type::rank] = m_stride * m_dim.N0; + return m_stride * m_dim.N0; + } + // clang-format on + + // Fill the target 
unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2133,34 +2218,50 @@ struct ViewOffset { return m_stride.S7; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - if (0 < dimension_type::rank) { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride.S0; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = m_stride.S1; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = m_stride.S2; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = m_stride.S3; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = m_stride.S4; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = m_stride.S5; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = m_stride.S6; } - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = m_stride.S7; } - s[dimension_type::rank] = span(); + return span(); + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. 
This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2428,288 +2529,6 @@ struct ViewDataHandle< namespace Kokkos { namespace Impl { - -template -inline bool is_zero_byte(const T& t) { - using comparison_type = std::conditional_t< - sizeof(T) % sizeof(long long int) == 0, long long int, - std::conditional_t< - sizeof(T) % sizeof(long int) == 0, long int, - std::conditional_t< - sizeof(T) % sizeof(int) == 0, int, - std::conditional_t>>>; - const auto* const ptr = reinterpret_cast(&t); - for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (ptr[i] != 0) return false; - return true; -} - -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. 
- */ -template ::value> -struct ViewValueFunctor; - -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - - struct DestroyTag {}; - struct ConstructTag {}; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { - new (ptr + i) ValueType(); - } - - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { - (ptr + i)->~ValueType(); - } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) { - functor_instantiate_workaround(); - } - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) { - functor_instantiate_workaround(); - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - - template - void parallel_for_implementation() { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void construct_shared_allocation() { construct_dispatch(); } - - void destroy_shared_allocation() { - parallel_for_implementation(); - } - - // This function is to ensure that the functor with DestroyTag is instantiated - // This is a workaround to avoid "cudaErrorInvalidDeviceFunction" error later - // when the function is queried with cudaFuncGetAttributes - void functor_instantiate_workaround() 
{ -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - parallel_for_implementation(); - } -#endif - } -}; - -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - //---------------------------------------------------------------------------- /** \brief View mapping for non-specialized data type and standard layout */ template @@ -2814,11 +2633,24 @@ class ViewMapping< return m_impl_offset.stride_7(); } + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements template KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { m_impl_offset.stride(s); } + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + template + KOKKOS_INLINE_FUNCTION iType stride_fill(iType* const s) const { + return m_impl_offset.stride_fill(s); + } + //---------------------------------------- // Range span @@ -3360,7 +3192,7 @@ struct SubViewDataTypeImpl> { }; /* for integral args, subview doesn't have that dimension */ -template struct SubViewDataTypeImpl< std::enable_if_t>::value>, @@ -3369,7 +3201,7 @@ struct SubViewDataTypeImpl< Kokkos::Experimental::Extents, Args...> {}; /* for ALL slice, subview has the same dimension */ -template +template struct SubViewDataTypeImpl, Kokkos::ALL_t, Args...> @@ -3380,7 +3212,7 @@ struct SubViewDataTypeImpl struct SubViewDataTypeImpl< std::enable_if_t::value>, ValueType, diff --git a/lib/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp b/lib/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp index 1130485e841..b2faccc5270 100644 --- a/lib/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/lib/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -56,6 +56,8 @@ #define KOKKOS_LAMBDA [=] __host__ __device__ #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#define KOKKOS_DEDUCTION_GUIDE __host__ __device__ + #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline diff --git a/lib/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp b/lib/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp index 7b018661070..a3c5000b338 100644 --- a/lib/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp +++ b/lib/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp @@ -27,6 
+27,8 @@ #define KOKKOS_LAMBDA [=] __host__ __device__ #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#define KOKKOS_DEDUCTION_GUIDE __host__ __device__ + #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline #define KOKKOS_IMPL_FUNCTION __device__ __host__ diff --git a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp index 30f6fa2ad23..b117d75acb9 100644 --- a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -45,4 +45,21 @@ #define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! +#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template +using sycl_device_ptr = sycl::ext::intel::device_ptr; +template +using sycl_host_ptr = sycl::ext::intel::host_ptr; +#else +template +using sycl_device_ptr = sycl::device_ptr; +template +using sycl_host_ptr = sycl::host_ptr; +#endif +} // namespace Kokkos::Impl + #endif diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 6dfb7505c5d..f8215818727 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -93,6 +93,9 @@ SET(COMPILE_ONLY_SOURCES TestViewTypeTraits.cpp TestTypeList.cpp TestMDRangePolicyCTAD.cpp + TestTeamPolicyCTAD.cpp + TestTeamMDRangePolicyCTAD.cpp + TestNestedReducerCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) @@ -105,6 +108,9 @@ endif() IF(KOKKOS_HAS_TRILINOS) LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp) ENDIF() +if(Kokkos_ENABLE_OPENMPTARGET) + list(REMOVE_ITEM COMPILE_ONLY_SOURCES TestNestedReducerCTAD.cpp) +endif() KOKKOS_ADD_EXECUTABLE( CoreTestCompileOnly SOURCES @@ -148,8 +154,10 @@ foreach(Tag 
Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Crs DeepCopyAlignment ExecSpacePartitioning + ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis + Graph HostSharedPtr HostSharedPtrAccessOnDevice Init @@ -173,7 +181,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endforeach() set(${Tag}_SOURCES1B) - foreach(Name + set(${Tag}_TESTNAMES1B MDRange_a MDRange_b MDRange_c @@ -184,6 +192,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDRangePolicyConstructors MDRangeReduce MDSpan + MDSpanAtomicAccessor + MDSpanConversion MinMaxClamp NumericTraits OccupancyControlTrait @@ -203,8 +213,19 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + SpaceAwareAccessorAccessViolation + SpaceAwareAccessor Swap ) + IF (NOT Kokkos_ENABLE_IMPL_MDSPAN) + LIST(REMOVE_ITEM ${Tag}_TESTNAMES1B + MDSpanAtomicAccessor + MDSpanConversion + SpaceAwareAccessorAccessViolation + SpaceAwareAccessor + ) + ENDIF() + foreach(Name IN LISTS ${Tag}_TESTNAMES1B) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
@@ -217,7 +238,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endforeach() SET(${Tag}_SOURCES2A) - foreach(Name + SET(${TAG}_TESTNAMES2A TeamBasic TeamCombinedReducers TeamMDRange @@ -234,8 +255,10 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewAPI_c ViewAPI_d ViewAPI_e + ViewBadAlloc ViewCopy_a ViewCopy_b + ViewCopy_c ViewCtorDimMatch ViewEmptyRuntimeUnmanaged ViewHooks @@ -245,11 +268,21 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOfViews ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing ) + # Workaround to internal compiler error with intel classic compilers + # when using -no-ip flag in ViewCopy_c + # See issue: + IF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + LIST(REMOVE_ITEM ${Tag}_TESTNAMES2A + ViewCopy_c + ) + endif() + foreach(Name IN LISTS ${Tag}_TESTNAMES2A) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
@@ -353,6 +386,7 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview @@ -648,12 +682,6 @@ if(Kokkos_ENABLE_SERIAL) UnitTestMainInit.cpp ${Serial_SOURCES2} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SerialGraph - SOURCES - UnitTestMainInit.cpp - serial/TestSerial_Graph.cpp - ) endif() if(Kokkos_ENABLE_THREADS) @@ -681,12 +709,6 @@ if (Kokkos_ENABLE_OPENMP) UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPGraph - SOURCES - UnitTestMainInit.cpp - openmp/TestOpenMP_Graph.cpp - ) endif() if(Kokkos_ENABLE_HPX) @@ -794,12 +816,6 @@ if(Kokkos_ENABLE_CUDA) UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaGraph - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Graph.cpp - ) endif() if(Kokkos_ENABLE_HIP) @@ -827,12 +843,6 @@ if(Kokkos_ENABLE_HIP) UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPGraph - SOURCES - UnitTestMainInit.cpp - hip/TestHIP_Graph.cpp - ) endif() if(Kokkos_ENABLE_SYCL) @@ -902,15 +912,21 @@ if(Kokkos_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpInit_Context SOURCES - UnitTestMainInit.cpp + UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Init_Context.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpStreams SOURCES - UnitTestMain.cpp + UnitTestMain.cpp sycl/TestSYCL_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_SYCLInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp + ) endif() SET(DEFAULT_DEVICE_SOURCES @@ -993,6 +1009,13 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( UnitTest_PushFinalizeHook.cpp ) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_ScopeGuard + SOURCES + UnitTestMain.cpp + UnitTest_ScopeGuard.cpp +) + # This test is intended for development and 
debugging by putting code # into TestDefaultDeviceDevelop.cpp. By default its empty. KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -1002,23 +1025,35 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( default/TestDefaultDeviceDevelop.cpp ) -# This test is special, because it passes exactly when it prints the -# message "PASSED: I am the custom std::terminate handler.", AND calls -# std::terminate. This means that we can't use -# KOKKOS_ADD_EXECUTABLE_AND_TEST. See GitHub issue #2147. - -KOKKOS_ADD_TEST_EXECUTABLE( push_finalize_hook_terminate - SOURCES UnitTest_PushFinalizeHook_terminate.cpp -) +# With MSVC, the terminate handler is called and prints the message but the +# program does not seem to exit and we get a timeout with ctest. +if (NOT WIN32) + # This test is special, because it passes exactly when it prints the + # message "PASSED: I am the custom std::terminate handler.", AND calls + # std::terminate. This means that we can't use + # KOKKOS_ADD_EXECUTABLE_AND_TEST. See GitHub issue #2147. + KOKKOS_ADD_TEST_EXECUTABLE( + CoreUnitTest_PushFinalizeHookTerminate + SOURCES UnitTest_PushFinalizeHook_terminate.cpp + ) + add_test( + NAME Kokkos_CoreUnitTest_PushFinalizeHookTerminateRegex + COMMAND ${CMAKE_COMMAND} -E env $ + ) + set_property( + TEST Kokkos_CoreUnitTest_PushFinalizeHookTerminateRegex + PROPERTY PASS_REGULAR_EXPRESSION "PASSED: I am the custom std::terminate handler." + ) + add_test( + NAME Kokkos_CoreUnitTest_PushFinalizeHookTerminateFails + COMMAND ${CMAKE_COMMAND} -E env $ + ) + set_property( + TEST Kokkos_CoreUnitTest_PushFinalizeHookTerminateFails + PROPERTY WILL_FAIL TRUE + ) +endif() -KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate - TEST_0 - EXEC push_finalize_hook_terminate - NUM_MPI_PROCS 1 - PASS_REGULAR_EXPRESSION - "PASSED: I am the custom std::terminate handler." 
- ALWAYS_FAIL_ON_ZERO_RETURN -) if(KOKKOS_ENABLE_TUNING) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_TuningBuiltins @@ -1243,7 +1278,7 @@ if (NOT KOKKOS_HAS_TRILINOS) ) add_test( NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/ + COMMAND ${Python3_EXECUTABLE} $/ -v ) endif() endif() diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 202809d3fc9..a4d65687e54 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -62,7 +62,7 @@ else STACK_TRACE_TERMINATE_FILTER := endif -TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d 
Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ @@ -73,7 +73,7 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ ) \ ) -GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview +GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13 @@ -110,14 +110,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_Init.o OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o OBJ_CUDA += TestCuda_RangePolicy.o TestCuda_RangePolicyRequire.o - OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o + OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o TestCuda_ViewCopy_c.o OBJ_CUDA += TestCuda_DeepCopyAlignment.o OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o TestCuda_ViewResize.o TestCuda_ViewLayoutStrideAssignment.o OBJ_CUDA += TestCudaUVM_ViewAPI_a.o TestCudaUVM_ViewAPI_b.o TestCudaUVM_ViewAPI_c.o TestCudaUVM_ViewAPI_d.o TestCudaUVM_ViewAPI_e.o - OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o + OBJ_CUDA += 
TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o TestCudaUVM_ViewCopy_c.o OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o TestCudaUVM_ViewMapping_subview.o OBJ_CUDA += TestCudaHostPinned_ViewAPI_a.o TestCudaHostPinned_ViewAPI_b.o TestCudaHostPinned_ViewAPI_c.o TestCudaHostPinned_ViewAPI_d.o TestCudaHostPinned_ViewAPI_e.o - OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o + OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o TestCudaHostPinned_ViewCopy_c.o OBJ_CUDA += TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o OBJ_CUDA += TestCuda_View_64bit.o OBJ_CUDA += TestCuda_ViewOfClass.o @@ -162,7 +162,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) OBJ_THREADS += TestThreads_RangePolicy.o TestThreads_RangePolicyRequire.o OBJ_THREADS += TestThreads_View_64bit.o OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o TestThreads_ViewAPI_c.o TestThreads_ViewAPI_d.o TestThreads_ViewAPI_e.o - OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o + OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o TestThreads_ViewCopy_c.o OBJ_THREADS += TestThreads_DeepCopyAlignment.o OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o TestThreads_ViewResize.o TestThreads_ViewLayoutStrideAssignment.o OBJ_THREADS += TestThreads_ViewOfClass.o @@ -198,7 +198,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_RangePolicy.o TestOpenMP_RangePolicyRequire.o OBJ_OPENMP += TestOpenMP_View_64bit.o OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o TestOpenMP_ViewAPI_c.o TestOpenMP_ViewAPI_d.o TestOpenMP_ViewAPI_e.o - OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o + OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o TestOpenMP_ViewCopy_c.o OBJ_OPENMP += 
TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o TestOpenMP_ViewResize.o TestOpenMP_ViewLayoutStrideAssignment.o OBJ_OPENMP += TestOpenMP_ViewOfClass.o OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o @@ -237,7 +237,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_a.o TestOpenMPTarget_ViewAPI_b.o TestOpenMPTarget_ViewAPI_c.o TestOpenMPTarget_ViewAPI_d.o #Some commented out code - #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o TestOpenMPTarget_ViewCopy_c.o OBJ_OPENMPTARGET += TestOpenMPTarget_DeepCopyAlignment.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_b.o @@ -292,7 +292,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) OBJ_HIP += TestHIP_Memory_Requirements.o OBJ_HIP += TestHIP_ParallelScanRangePolicy.o OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o - OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o + OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o TestHIPHostPinned_ViewCopy_c.o OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o TARGETS += KokkosCore_UnitTest_HIP @@ -307,7 +307,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += TestHPX_RangePolicy.o TestHPX_RangePolicyRequire.o OBJ_HPX += TestHPX_View_64bit.o OBJ_HPX += TestHPX_ViewAPI_a.o TestHPX_ViewAPI_b.o TestHPX_ViewAPI_c.o TestHPX_ViewAPI_d.o TestHPX_ViewAPI_e.o - OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o + OBJ_HPX += TestHPX_ViewCopy_a.o 
TestHPX_ViewCopy_b.o TestHPX_ViewCopy_c.o OBJ_HPX += TestHPX_ViewMapping_a.o TestHPX_ViewMapping_b.o TestHPX_ViewMapping_subview.o TestHPX_ViewResize.o OBJ_HPX += TestHPX_ViewOfClass.o OBJ_HPX += TestHPX_SubView_a.o TestHPX_SubView_b.o @@ -347,7 +347,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_RangePolicy.o TestSerial_RangePolicyRequire.o OBJ_SERIAL += TestSerial_View_64bit.o OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o TestSerial_ViewAPI_c.o TestSerial_ViewAPI_d.o TestSerial_ViewAPI_e.o - OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o + OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o TestSerial_ViewCopy_c.o OBJ_SERIAL += TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o TestSerial_ViewResize.o TestSerial_ViewLayoutStrideAssignment.o OBJ_SERIAL += TestSerial_ViewOfClass.o OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp deleted file mode 100644 index f1316a7426a..00000000000 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ /dev/null @@ -1,108 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef TEST_AGGREGATE_HPP -#define TEST_AGGREGATE_HPP - -#include - -namespace Test { - -template -void TestViewAggregate() { - using value_type = Kokkos::Array; - using analysis_1d = - Kokkos::Impl::ViewDataAnalysis; - - static_assert( - std::is_same >::value); - - using a32_traits = Kokkos::ViewTraits; - using flat_traits = - Kokkos::ViewTraits; - - static_assert( - std::is_same >::value); - static_assert( - std::is_same::value); - static_assert(a32_traits::rank == 2); - static_assert(a32_traits::rank_dynamic == 2); - - static_assert(std::is_void::value); - static_assert(flat_traits::rank == 3); - static_assert(flat_traits::rank_dynamic == 2); - static_assert(flat_traits::dimension::N2 == 32); - - using a32_type = Kokkos::View **, DeviceType>; - using a32_flat_type = typename a32_type::array_type; - - static_assert(std::is_same::value); - static_assert(std::is_same::value); - static_assert(a32_type::rank == 2); - static_assert(a32_flat_type::rank == 3); - - a32_type x("test", 4, 5); - a32_flat_type y(x); - - ASSERT_EQ(x.extent(0), 4u); - ASSERT_EQ(x.extent(1), 5u); - ASSERT_EQ(y.extent(0), 4u); - ASSERT_EQ(y.extent(1), 5u); - ASSERT_EQ(y.extent(2), 32u); - - // Initialize arrays from brace-init-list as for std::array. - // - // Comment: Clang will issue the following warning if we don't use double - // braces here (one for initializing the Kokkos::Array and one for - // initializing the sub-aggreagate C-array data member), - // - // warning: suggest braces around initialization of subobject - // - // but single brace syntax would be valid as well. 
- Kokkos::Array aggregate_initialization_syntax_1 = {{1.41, 3.14}}; - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[0], 1.41); - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[1], 3.14); - - Kokkos::Array aggregate_initialization_syntax_2{ - {0, 1, 2}}; // since C++11 - for (int i = 0; i < 3; ++i) { - ASSERT_EQ(aggregate_initialization_syntax_2[i], i); - } - - // Note that this is a valid initialization. - Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; - for (int i = 0; i < 2; ++i) { - ASSERT_DOUBLE_EQ(initialized_with_one_argument_missing[i], 255); - } - // But the following line would not compile - // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; - - // The code below must compile for zero-sized arrays. - using T = float; - - constexpr int N = 0; - Kokkos::Array a; - for (int i = 0; i < N; ++i) { - a[i] = T(); - } -} - -TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate(); } - -} // namespace Test - -#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/lib/kokkos/core/unit_test/TestArray.cpp b/lib/kokkos/core/unit_test/TestArray.cpp index 673d0036b71..cb713a17826 100644 --- a/lib/kokkos/core/unit_test/TestArray.cpp +++ b/lib/kokkos/core/unit_test/TestArray.cpp @@ -15,9 +15,19 @@ //@HEADER #include +#include namespace { +// nvcc errors on variables only used in static_asserts +// Passing those variables to this function should eliminate the warning +template +KOKKOS_FUNCTION constexpr void maybe_unused(Ts&&...) 
{} + +template +using equality_comparable = + decltype(std::declval() == std::declval()); + KOKKOS_FUNCTION constexpr bool test_array() { constexpr Kokkos::Array a{{1, 2}}; @@ -49,17 +59,6 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); -template -KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { - if (std::size(l) != std::size(r)) return false; - - for (size_t i = 0; i != std::size(l); ++i) { - if (l[i] != r[i]) return false; - } - - return true; -} - // Disable ctad test for intel versions < 2021, see issue #6702 #if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 KOKKOS_FUNCTION constexpr bool test_array_ctad() { @@ -67,10 +66,180 @@ KOKKOS_FUNCTION constexpr bool test_array_ctad() { constexpr Kokkos::Array a{1, 2, 3, 5, x}; constexpr Kokkos::Array b{1, 2, 3, 5, x}; - return std::is_same_v && is_equal(a, b); + return std::is_same_v && a == b; } static_assert(test_array_ctad()); #endif +KOKKOS_FUNCTION constexpr bool test_array_aggregate_initialization() { + // Initialize arrays from brace-init-list as for std::array. + + Kokkos::Array aggregate_initialization_syntax_1 = {1.41f, 3.14f}; + if ((aggregate_initialization_syntax_1[0] != 1.41f) || + (aggregate_initialization_syntax_1[1] != 3.14f)) + return false; + + Kokkos::Array aggregate_initialization_syntax_2{ + {0, 1, 2}}; // since C++11 + if ((aggregate_initialization_syntax_2[0] != 0) || + (aggregate_initialization_syntax_2[1] != 1) || + (aggregate_initialization_syntax_2[2] != 2)) + return false; + + // Note that this is a valid initialization. 
+ Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; + if ((initialized_with_one_argument_missing[0] != 255) || + (initialized_with_one_argument_missing[1] != 255) || + (initialized_with_one_argument_missing[2] != 0)) + return false; + + // But the following line would not compile + // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; + + return true; +} + +static_assert(test_array_aggregate_initialization()); + +// A few compilers, such as GCC 8.4, were erroring out when the function below +// appeared in a constant expression because +// Kokkos::Array::operator[] is non-constexpr. The issue +// disappears with GCC 9.1 ( As a workaround, +// the static_assert was dropped and the [[maybe_unused]] is used as an attempt +// to silence warnings that the function is never used. +[[maybe_unused]] KOKKOS_FUNCTION void test_array_zero_sized() { + using T = float; + + // The code below must compile for zero-sized arrays. + constexpr int N = 0; + Kokkos::Array a; + for (int i = 0; i < N; ++i) { + a[i] = T(); + } +} + +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + +// User-defined type providing a specialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr KOKKOS_FUNCTION void kokkos_swap(MyInt& lhs, + MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array a{MyInt{1}, MyInt{2}}; + Kokkos::Array b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-defined kokkos_swap(MyInt) overload was called + if (a[0].i != 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} +
+static_assert(test_array_specialization_kokkos_swap()); + +constexpr bool test_to_array() { + // copies a string literal + [[maybe_unused]] auto a1 = Kokkos::to_array("foo"); + static_assert(a1.size() == 4); + maybe_unused(a1); + + // deduces both element type and length + [[maybe_unused]] auto a2 = Kokkos::to_array({0, 2, 1, 3}); + static_assert(std::is_same_v>); + maybe_unused(a2); + +// gcc8, icc, and nvcc 11.3 do not support the implicit conversion +#if !(defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 910)) && \ + !(defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 2021)) && \ + !(defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140)) + // deduces length with element type specified + // implicit conversion happens + [[maybe_unused]] auto a3 = Kokkos::to_array({0, 1, 3}); + static_assert(std::is_same_v>); + maybe_unused(a3); +#endif + + return true; +} + +static_assert(test_to_array()); + +constexpr bool test_array_equality_comparable() { + using C0 = Kokkos::Array; + using C2 = Kokkos::Array; + using C3 = Kokkos::Array; + using I0 = Kokkos::Array; + using I2 = Kokkos::Array; + using I3 = Kokkos::Array; + + static_assert(Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + + static_assert(!Kokkos::is_detected_v); + static_assert(Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + 
static_assert(!Kokkos::is_detected_v); + static_assert(Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(!Kokkos::is_detected_v); + static_assert(Kokkos::is_detected_v); + + return true; +} + +static_assert(test_array_equality_comparable()); + } // namespace diff --git a/lib/kokkos/core/unit_test/TestArrayOps.hpp b/lib/kokkos/core/unit_test/TestArrayOps.hpp index 06528572714..29a452b660c 100644 --- a/lib/kokkos/core/unit_test/TestArrayOps.hpp +++ b/lib/kokkos/core/unit_test/TestArrayOps.hpp @@ -92,6 +92,31 @@ TEST(TEST_CATEGORY, array_element_access) { ASSERT_EQ([index], a[index]); } +TEST(TEST_CATEGORY, array_operator_equal) { + using A = Kokkos::Array; + constexpr A a{{3, 5}}; + constexpr A b{{3, 5}}; + constexpr A c{{5, 3}}; + + static_assert(a == b); + static_assert(!(a == c)); + static_assert(a != c); + + ASSERT_TRUE(a == b); + ASSERT_FALSE(a == c); + ASSERT_TRUE(a != c); + + using E = Kokkos::Array; + constexpr E e; + constexpr E f; + + static_assert(e == f); + static_assert(!(e != f)); + + ASSERT_TRUE(e == f); + ASSERT_FALSE(e != f); +} + TEST(TEST_CATEGORY, array_zero_capacity) { using A = Kokkos::Array; A e; @@ -111,6 +136,8 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(, nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -389,5 +416,7 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / 
eStride); ASSERT_EQ(e[0], ee[0]); } +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif } // namespace diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp index cd7ba47aa1e..957ba9a7aa0 100644 --- a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp +++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -459,9 +459,11 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 12: return true; #else case 11: - return update_in >= 0 ? atomic_op_test( - old_val, update) - : true; + return (std::make_signed_t(update_in) >= 0 && + std::make_signed_t(old_val) >= 0) + ? atomic_op_test(old_val, + update) + : true; case 12: return update_in >= 0 ? atomic_op_test( old_val, update) diff --git a/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp b/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp index 2f3bcfe817d..fe015404f1b 100644 --- a/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/lib/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp @@ -827,12 +827,6 @@ struct TestBitCastFunction { } } -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v) { - return; - } -#endif struct S { int i; diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index 5501a35b7f0..ef6a21cd370 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -15,9 +15,26 @@ //@HEADER #include -#include #include +// Suppress "'long double' is treated as 'double' in device code" +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 20208 +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push +#pragma diag_suppress 20208 +#endif +#endif +#endif + +namespace { +template +KOKKOS_FUNCTION constexpr void maybe_unused(Ts &&...) 
noexcept {} +} // namespace + namespace Test { // Test construction and assignment @@ -532,4 +549,151 @@ TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) { Kokkos::complex>::value)); } +template +struct TestComplexStructuredBindings { + using exec_space = ExecSpace; + using value_type = double; + using complex_type = Kokkos::complex; + using device_view_type = Kokkos::View; + using host_view_type = typename device_view_type::HostMirror; + + device_view_type d_results; + host_view_type h_results; + + // tuple_size + static_assert(std::is_same_v::type, + std::integral_constant>); + + // tuple_element + static_assert( + std::is_same_v, value_type>); + static_assert( + std::is_same_v, value_type>); + + static void testgetreturnreferencetypes() { + complex_type m; + const complex_type c; + + // get lvalue + complex_type &ml = m; + static_assert(std::is_same_v(ml)), value_type &>); + static_assert(std::is_same_v(ml)), value_type &>); + + // get rvalue + complex_type &&mr = std::move(m); + static_assert( + std::is_same_v(std::move(mr))), value_type &&>); + static_assert( + std::is_same_v(std::move(mr))), value_type &&>); + + // get const lvalue + const complex_type &cl = c; + static_assert( + std::is_same_v(cl)), value_type const &>); + static_assert( + std::is_same_v(cl)), value_type const &>); + + // get const rvalue + complex_type const &&cr = std::move(c); + static_assert(std::is_same_v(std::move(cr))), + value_type const &&>); + static_assert(std::is_same_v(std::move(cr))), + value_type const &&>); + + maybe_unused(m, c, ml, mr, cl, cr); + } + + void testit() { + testgetreturnreferencetypes(); + + d_results = device_view_type("TestComplexStructuredBindings", 6); + h_results = Kokkos::create_mirror_view(d_results); + + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), *this); + Kokkos::fence(); + Kokkos::deep_copy(h_results, d_results); + + // get lvalue + ASSERT_FLOAT_EQ(h_results[0].real(), 2.); + ASSERT_FLOAT_EQ(h_results[0].imag(), 3.); + + // get 
rvalue + ASSERT_FLOAT_EQ(h_results[1].real(), 2.); + ASSERT_FLOAT_EQ(h_results[1].imag(), 3.); + + // get const lvalue + ASSERT_FLOAT_EQ(h_results[2].real(), 5.); + ASSERT_FLOAT_EQ(h_results[2].imag(), 7.); + + // get const rvalue + ASSERT_FLOAT_EQ(h_results[3].real(), 5.); + ASSERT_FLOAT_EQ(h_results[3].imag(), 7.); + + // swap real and imaginary + ASSERT_FLOAT_EQ(h_results[4].real(), 11.); + ASSERT_FLOAT_EQ(h_results[4].imag(), 13.); + ASSERT_FLOAT_EQ(h_results[5].real(), 13.); + ASSERT_FLOAT_EQ(h_results[5].imag(), 11.); + } + + KOKKOS_FUNCTION + void operator()(int) const { + complex_type m(2., 3.); + const complex_type c(5., 7.); + + // get lvalue + { + complex_type &ml = m; + auto &[mlr, mli] = ml; + d_results[0] = complex_type(mlr, mli); + } + + // get rvalue + { + complex_type &&mr = std::move(m); + auto &&[mrr, mri] = std::move(mr); + d_results[1] = complex_type(mrr, mri); + } + + // get const lvalue + { + const complex_type &cl = c; + auto &[clr, cli] = cl; + d_results[2] = complex_type(clr, cli); + } + + // get const rvalue + { + complex_type const &&cr = std::move(c); + auto &&[crr, cri] = std::move(cr); + d_results[3] = complex_type(crr, cri); + } + + // swap real and imaginary + { + complex_type z(11., 13.); + d_results[4] = z; + + auto &[zr, zi] = z; + Kokkos::kokkos_swap(zr, zi); + d_results[5] = z; + } + } +}; + +TEST(TEST_CATEGORY, complex_structured_bindings) { + TestComplexStructuredBindings test; + test.testit(); +} + } // namespace Test + +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#endif diff --git a/lib/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp b/lib/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 00000000000..a83355c51fe --- /dev/null +++ b/lib/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,327 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +#ifdef KOKKOS_ENABLE_OPENMP +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) + Kokkos::atomic_store(, 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenACC"; +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) + Kokkos::atomic_store(, 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + 
constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) + Kokkos::atomic_store(, 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + 
TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +// FIXME_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v) + 
GTEST_SKIP() << "skipping since test is know to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(, 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(; + if (Kokkos::atomic_load( != N) + Kokkos::atomic_store(, 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenACC"; +#endif + run_exec_space_thread_safety_range_scan(); +} + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestExecutionSpace.hpp b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp index 983a5975afd..d4142dee18b 100644 --- a/lib/kokkos/core/unit_test/TestExecutionSpace.hpp +++ b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp @@ -44,4 +44,60 @@ TEST(TEST_CATEGORY, execution_space_as_class_data_member) { } #endif +constexpr bool test_execspace_explicit_construction() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_SERIAL + static_assert(std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + static_assert(std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_CUDA + static_assert(std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_HIP + static_assert(std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_HPX + static_assert(std::is_convertible_v); + static_assert( + std::is_convertible_v&&, + 
Kokkos::Experimental::HPX>); +#endif +#else +#ifdef KOKKOS_ENABLE_SERIAL + static_assert(!std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + static_assert(!std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_CUDA + static_assert(!std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_HIP + static_assert(!std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_HPX + static_assert(!std::is_convertible_v); + static_assert(!std::is_convertible_v< + hpx::execution::experimental::unique_any_sender<>&&, + Kokkos::Experimental::HPX>); +#endif +#endif + +#ifdef KOKKOS_ENABLE_OPENACC + static_assert(!std::is_convertible_v); +#endif +#ifdef KOKKOS_ENABLE_SYCL + static_assert( + !std::is_convertible_v); +#endif + + return true; +} + +static_assert(test_execspace_explicit_construction()); + } // namespace diff --git a/lib/kokkos/core/unit_test/TestGraph.hpp b/lib/kokkos/core/unit_test/TestGraph.hpp index 9a36d08f445..f9dc63d30c4 100644 --- a/lib/kokkos/core/unit_test/TestGraph.hpp +++ b/lib/kokkos/core/unit_test/TestGraph.hpp @@ -21,6 +21,21 @@ namespace Test { +template +struct NoOpReduceFunctor { + KOKKOS_FUNCTION void operator()(int, ValueType&) const { + Kokkos::abort("Should never be called!"); + } + KOKKOS_FUNCTION void operator()(int, int, ValueType&) const { + Kokkos::abort("Should never be called!"); + } + KOKKOS_FUNCTION void operator()( + const typename Kokkos::TeamPolicy::member_type&, + ValueType&) const { + Kokkos::abort("Should never be called!"); + } +}; + template struct CountTestFunctor { using value_type = int; @@ -66,7 +81,7 @@ struct SetResultToViewFunctor { } }; -struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { +struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test { public: using count_functor = CountTestFunctor; using set_functor = SetViewToValueFunctor; @@ -88,7 +103,7 @@ struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { } }; -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one) { auto graph = Kokkos::Experimental::create_graph([&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); @@ -101,7 +116,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }).submit(); @@ -112,7 +127,17 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET team_size incompatible + if (std::is_same_v) + GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; +#endif +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(SYCL_EXT_ONEAPI_GRAPH) // FIXME_SYCL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test case is known to fail with SYCL"; +#endif + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); @@ -145,7 +170,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) { view_type reduction_out{"reduction_out"}; view_host reduction_host{"reduction_host"}; Kokkos::Experimental::create_graph(ex, [&](auto root) { @@ -172,7 +197,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { // This test is disabled because we don't currently support copying to host, // even asynchronously. We _may_ want to do that eventually? 
-TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), DISABLED_repeat_chain) { auto graph = Kokkos::Experimental::create_graph( ex, [&, count_host = count_host](auto root) { //---------------------------------------- @@ -198,10 +223,27 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { //---------------------------------------- } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { - auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { - root.then_parallel_reduce(0, set_result_functor{bugs}, count); - }); +TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) { + auto graph = Kokkos::Experimental::create_graph( + ex, [&](Kokkos::Experimental::GraphNodeRef root) { + NoOpReduceFunctor no_op_functor; + root.then_parallel_reduce(Kokkos::RangePolicy(0, 0), + no_op_functor, count) +#if !defined(KOKKOS_ENABLE_SYCL) || \ + defined(SYCL_EXT_ONEAPI_GRAPH) // FIXME_SYCL +#if !defined(KOKKOS_ENABLE_CUDA) && \ + !defined(KOKKOS_ENABLE_HIP) // FIXME_CUDA FIXME_HIP + .then_parallel_reduce( + Kokkos::MDRangePolicy>{{0, 0}, + {0, 0}}, + no_op_functor, count) +#endif + .then_parallel_reduce( + Kokkos::TeamPolicy{0, Kokkos::AUTO}, + no_op_functor, count) +#endif + ; + }); // These fences are only necessary because of the weirdness of how CUDA // UVM works on pre pascal cards. #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ @@ -214,12 +256,15 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { // UVM works on pre pascal cards. 
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL)) - Kokkos::fence(); + if constexpr (std::is_same_v) Kokkos::fence(); +#endif +#ifdef KOKKOS_ENABLE_HPX // FIXME_HPX graph.submit() isn't properly enqueued + if constexpr (std::is_same_v) + Kokkos::fence(); #endif - graph.submit(); // should reset to 0, but doesn't + graph.submit(); Kokkos::deep_copy(ex, count_host, count); ex.fence(); ASSERT_EQ(count_host(), 0); } - } // end namespace Test diff --git a/lib/kokkos/core/unit_test/TestLocalDeepCopy.hpp b/lib/kokkos/core/unit_test/TestLocalDeepCopy.hpp index 1ee23a47c45..c6ee687cf91 100644 --- a/lib/kokkos/core/unit_test/TestLocalDeepCopy.hpp +++ b/lib/kokkos/core/unit_test/TestLocalDeepCopy.hpp @@ -907,13 +907,7 @@ void impl_test_local_deepcopy_rangepolicy_rank_7(const int N) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif - using ViewType = Kokkos::View; + using ViewType = Kokkos::View; { // Rank-1 impl_test_local_deepcopy_teampolicy_rank_1(8); @@ -940,13 +934,7 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif - using ViewType = Kokkos::View; + using ViewType = Kokkos::View; { // Rank-1 impl_test_local_deepcopy_rangepolicy_rank_1(8); @@ -973,12 +961,6 @@ 
TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif using ViewType = Kokkos::View; { // Rank-1 @@ -1006,12 +988,6 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutright) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif using ViewType = Kokkos::View; diff --git a/lib/kokkos/core/unit_test/TestMDSpan.hpp b/lib/kokkos/core/unit_test/TestMDSpan.hpp index ef0bea1394a..fa88b547a5f 100644 --- a/lib/kokkos/core/unit_test/TestMDSpan.hpp +++ b/lib/kokkos/core/unit_test/TestMDSpan.hpp @@ -35,13 +35,19 @@ void test_mdspan_minimal_functional() { Kokkos::parallel_reduce( "CheckMinimalMDSpan", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(int i, int& err) { +#if !defined(KOKKOS_ENABLE_OPENACC) Kokkos::mdspan> b_mds(, N); -#ifdef KOKKOS_ENABLE_CXX23 +#endif +#if !defined(KOKKOS_ENABLE_CXX17) && !defined(KOKKOS_ENABLE_CXX20) if (a_mds[i] != i) err++; +#if !defined(KOKKOS_ENABLE_OPENACC) if (b_mds[i] != i) err++; +#endif #else if (a_mds(i) != i) err++; +#if !defined(KOKKOS_ENABLE_OPENACC) if (b_mds(i) != i) err++; +#endif #endif }, errors); diff --git a/lib/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp b/lib/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp new file mode 100644 index 
00000000000..04460e64195 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp @@ -0,0 +1,112 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#ifndef KOKKOS_ENABLE_CXX17 +#include +#endif + +template +void test_atomic_accessor() { + using value_type = std::remove_const_t; + Kokkos::View v("V", 100); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = i; }); + + int errors; + using acc_t = Kokkos::Impl::AtomicAccessorRelaxed; + acc_t acc{}; + typename acc_t::data_handle_type ptr =; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, v.extent(0)), + KOKKOS_LAMBDA(int i, int& error) { + if (acc.access(ptr, i) != ptr[i]) error++; + if (acc.offset(ptr, i) != ptr + i) error++; + static_assert(std::is_same_v); + static_assert( + std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_nothrow_move_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_swappable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_default_constructible_v); + static_assert(std::is_trivially_constructible_v); + static_assert(std::is_trivially_move_constructible_v); + static_assert(std::is_trivially_assignable_v); + static_assert(std::is_trivially_move_assignable_v); +#ifndef KOKKOS_ENABLE_CXX17 + static_assert(std::copyable); + 
static_assert(std::is_empty_v); +#endif + }, + errors); + ASSERT_EQ(errors, 0); +} + +void test_atomic_accessor_conversion() { + using ExecutionSpace = TEST_EXECSPACE; + using T = float; + using acc_t = Kokkos::Impl::AtomicAccessorRelaxed; + using const_acc_t = Kokkos::Impl::AtomicAccessorRelaxed; + using int_acc_t = Kokkos::Impl::AtomicAccessorRelaxed; + using defacc_t = Kokkos::default_accessor; + using const_defacc_t = Kokkos::default_accessor; + using int_defacc_t = Kokkos::default_accessor; + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(int) { + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(std::is_convertible_v); + static_assert(std::is_convertible_v); + static_assert(std::is_convertible_v); + static_assert(std::is_convertible_v); + static_assert(!std::is_convertible_v); + static_assert(!std::is_convertible_v); + static_assert(!std::is_convertible_v); + }); +} + +TEST(TEST_CATEGORY, mdspan_atomic_accessor) { + using ExecutionSpace = TEST_EXECSPACE; + test_atomic_accessor(); + test_atomic_accessor(); +} diff --git a/lib/kokkos/core/unit_test/TestMDSpanConversion.hpp b/lib/kokkos/core/unit_test/TestMDSpanConversion.hpp new file mode 100644 index 00000000000..10123901c43 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMDSpanConversion.hpp @@ -0,0 +1,507 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include "experimental/__p0009_bits/layout_stride.hpp" + +namespace { + +template +struct TestViewMDSpanConversion { + using value_type = T; + + template + using layout_left_padded = Kokkos::Experimental::layout_left_padded; + + template + using layout_right_padded = + Kokkos::Experimental::layout_right_padded; + + struct TestAccessor { + using offset_policy = TestAccessor; + using element_type = value_type; + using reference = element_type &; + using data_handle_type = element_type *; + + constexpr TestAccessor() noexcept = default; + constexpr reference access(data_handle_type p, std::size_t i) noexcept { + return p[i]; + } + constexpr data_handle_type offset(data_handle_type p, + std::size_t i) noexcept { + return p + i; + } + }; + + template + static void test_conversion_from_mdspan( + Kokkos::View ref, + const MDSpanLayoutMapping &mapping) { + using unmanaged_view_type = + Kokkos::View>; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename unmanaged_view_type::traits>::mdspan_type; + using mapping_type = MDSpanLayoutMapping; + using mdspan_layout_type = typename MDSpanLayoutMapping::layout_type; + using extents_type = typename mapping_type::extents_type; + using mdspan_type = + Kokkos::mdspan; + + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v == + std::is_convertible_v); + // Manually create an mdspan from ref so we have a valid pointer to play + // with + const auto &exts = mapping.extents(); + auto mds = mdspan_type{, mapping}; + + auto test_view = 
unmanaged_view_type(mds); + + ASSERT_EQ(,; + ASSERT_EQ(, mds.data_handle()); + ASSERT_EQ(test_view.layout(), ref.layout()); + for (std::size_t r = 0; r < mdspan_type::rank(); ++r) { + ASSERT_EQ(test_view.extent(r), ref.extent(r)); + ASSERT_EQ(test_view.extent(r), exts.extent(r)); + } + } + + template + static void test_conversion_to_mdspan( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v) { + using view_type = ViewType; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename view_type::traits>::mdspan_type; + + static_assert(natural_mdspan_type::rank() == view_type::rank); + static_assert(std::is_same_v); + constexpr bool is_strided_layout = + std::is_same_v; + if constexpr (!is_strided_layout) { + static_assert(natural_mdspan_type::mapping_type::padding_value == + Kokkos::dynamic_extent); + } + // test conversion operator to natural mdspan + { + natural_mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(),; + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + + if constexpr (!is_strided_layout && natural_mdspan_type::rank() > 1) { + ASSERT_EQ(cvt.mapping().stride(1), ref_layout_mapping.stride(1)); + } + } + // test to_mdspan() returning natural mdspan + { + auto cvt = v.to_mdspan(); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(),; + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + // test conversion operator to different mdspan type + { + using element_type = const typename natural_mdspan_type::element_type; + using const_acc_type = Kokkos::Impl::SpaceAwareAccessor< + typename ViewType::memory_space, + Kokkos::default_accessor>; + using mdspan_type = Kokkos::mdspan< + element_type, + Kokkos::dextents, + typename natural_mdspan_type::layout_type, const_acc_type>; + mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(),; + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + } + + template + static void test_conversion_to_mdspan_with_accessor( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v, + const 
AccessorType &a) { + auto cvt = v.to_mdspan(a); + static_assert(decltype(cvt)::rank() == ViewType::rank); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(),; + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + + template + using natural_mdspan_type_for_view = typename Kokkos::Impl::MDSpanViewTraits< + typename ViewType::traits>::mdspan_type; + + static void run_test() { + // Verify we can only convert to compatible mdspans + static_assert(std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + static_assert( + std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Do not cast const away + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched dim + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched layouts + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + // nvcc doesn't do CTAD properly here, making this way more verbose.. 
+ // LayoutLeft + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutRight + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename 
layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutStride + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, {}, strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + { + const size_t strides[] = {2, 4}; + 
test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + // Conversion to mdspan + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4)); + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", + 4)); + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7)); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5})); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9})); + } + + // Aligned types (for padded layouts) + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 127, 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 7, 127)); + + // Conversion with standard default_accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + 
Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + Kokkos::default_accessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + Kokkos::default_accessor{}); + } + + // Conversion with a test accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + TestAccessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + 
TestAccessor{}); + } + } +}; + +TEST(TEST_CATEGORY, view_mdspan_conversion) { + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); +} + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestMathematicalConstants.hpp b/lib/kokkos/core/unit_test/TestMathematicalConstants.hpp index e446d813210..f52bfeaff7d 100644 --- a/lib/kokkos/core/unit_test/TestMathematicalConstants.hpp +++ b/lib/kokkos/core/unit_test/TestMathematicalConstants.hpp @@ -63,8 +63,7 @@ struct TestMathematicalConstants { KOKKOS_FUNCTION void use_on_device() const { #if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ - defined(KOKKOS_ENABLE_OPENACC) || \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 + defined(KOKKOS_ENABLE_OPENACC) take_by_value(Trait::value); #else (void)take_address_of(Trait::value); diff --git a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp index ad035d4e4bf..f996c61a527 100644 --- a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp +++ b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ -1585,34 +1585,24 @@ struct TestIsFinite { Kokkos::printf("failed isfinite(float)\n"); } #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) - if (!isfinite(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN::value) || + if (!isfinite(static_cast(2.f)) || + isfinite(quiet_NaN::value) || isfinite(signaling_NaN::value) || - isfinite(infinity::value) -#endif - ) { + isfinite(infinity::value)) { ++e; Kokkos::printf("failed isfinite(KE::half_t)\n"); } - if (!isfinite(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN::value) || + if (!isfinite(static_cast(2.f)) || + isfinite(quiet_NaN::value) || isfinite(signaling_NaN::value) || - isfinite(infinity::value) -#endif - ) { + isfinite(infinity::value)) { ++e; 
Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); } #endif - if (!isfinite(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN::value) || + if (!isfinite(3.) || isfinite(quiet_NaN::value) || isfinite(signaling_NaN::value) || - isfinite(infinity::value) -#endif - ) { + isfinite(infinity::value)) { ++e; Kokkos::printf("failed isfinite(double)\n"); } @@ -1666,33 +1656,24 @@ struct TestIsInf { Kokkos::printf("failed isinf(float)\n"); } #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) - if (isinf(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isinf(quiet_NaN::value) || + if (isinf(static_cast(2.f)) || + isinf(quiet_NaN::value) || isinf(signaling_NaN::value) || - !isinf(infinity::value) -#endif - ) { + !isinf(infinity::value)) { ++e; Kokkos::printf("failed isinf(KE::half_t)\n"); } - if (isinf(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isinf(quiet_NaN::value) || + if (isinf(static_cast(2.f)) || + isinf(quiet_NaN::value) || isinf(signaling_NaN::value) || - !isinf(infinity::value) -#endif - ) { + !isinf(infinity::value)) { ++e; Kokkos::printf("failed isinf(KE::bhalf_t)\n"); } #endif - if (isinf(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isinf(quiet_NaN::value) || - isinf(signaling_NaN::value) || !isinf(infinity::value) -#endif - ) { + if (isinf(3.) 
|| isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value)) { ++e; Kokkos::printf("failed isinf(double)\n"); } @@ -1746,32 +1727,23 @@ struct TestIsNaN { Kokkos::printf("failed isnan(float)\n"); } #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) - if (isnan(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN::value) || + if (isnan(static_cast(2.f)) || + !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) || - isnan(infinity::value) -#endif - ) { + isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(KE::half_t)\n"); } - if (isnan(static_cast(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN::value) || + if (isnan(static_cast(2.f)) || + !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) || - isnan(infinity::value) -#endif - ) { + isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } - if (isnan(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) || isnan(infinity::value) -#endif - ) { + if (isnan(3.) || !isnan(quiet_NaN::value) || + !isnan(signaling_NaN::value) || + isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(double)\n"); } diff --git a/lib/kokkos/core/unit_test/TestMultiGPU.hpp b/lib/kokkos/core/unit_test/TestMultiGPU.hpp new file mode 100644 index 00000000000..aad2fa45f49 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMultiGPU.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + exec.fence(); + exec0.fence(); + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + exec.fence(); + exec0.fence(); + Kokkos::parallel_reduce( + "Test::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::Team", + 
Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + 
ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} +} // namespace diff --git a/lib/kokkos/core/unit_test/TestNestedReducerCTAD.cpp b/lib/kokkos/core/unit_test/TestNestedReducerCTAD.cpp new file mode 100644 index 00000000000..95493a58742 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestNestedReducerCTAD.cpp @@ -0,0 +1,246 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestNestedReducerCTAD { + using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space; + using ScalarType = int; + using IndexType = int; + using TeamPolicy = Kokkos::TeamPolicy; + using TeamHandle = TeamPolicy::member_type; + + struct FakeComparator { + template + KOKKOS_FUNCTION bool operator()(T const&, T const&) const { + return true; + } + }; + + template + struct FakeFunctor { + KOKKOS_FUNCTION void operator()(int, ValueType&) const {} + }; + + template + KOKKOS_FUNCTION static void check_types([ + [maybe_unused]] ReducerTypeToCheck const& reducer) { + static_assert(std::is_same_v); + } + + KOKKOS_FUNCTION void operator()([ + [maybe_unused]] TeamHandle const& team_handle) const { + { + using ReducerTypeExpected = Kokkos::Sum; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::Sum reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Prod; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::Prod reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Min; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::Min reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Max; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::Max reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LAnd; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::LAnd reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LOr; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::LOr reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::BAnd; + using ValueType = ReducerTypeExpected::value_type; + 
Kokkos::View view; + Kokkos::BAnd reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::BOr; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::BOr reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MinLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MaxLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::MinMax; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MinMax reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinMaxLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MinMaxLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxFirstLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MaxFirstLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxFirstLocCustomComparator; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + FakeComparator comparator; + Kokkos::MaxFirstLocCustomComparator reducer(view, comparator); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinFirstLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MinFirstLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinFirstLocCustomComparator; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + FakeComparator comparator; + Kokkos::MinFirstLocCustomComparator reducer(view, comparator); + check_types(reducer); + } + + { + using ReducerTypeExpected = + 
Kokkos::MinMaxFirstLastLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::MinMaxFirstLastLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::MinMaxFirstLastLocCustomComparator< + ScalarType, IndexType, FakeComparator, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + FakeComparator comparator; + Kokkos::MinMaxFirstLastLocCustomComparator reducer(view, comparator); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::FirstLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::FirstLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LastLoc; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::LastLoc reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::StdIsPartitioned; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::StdIsPartitioned reducer(view); + check_types(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::StdPartitionPoint; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View view; + Kokkos::StdPartitionPoint reducer(view); + check_types(reducer); + } + } + + TestNestedReducerCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestNumericTraits.hpp b/lib/kokkos/core/unit_test/TestNumericTraits.hpp index 81a9d0a5e0d..0c803354883 100644 --- a/lib/kokkos/core/unit_test/TestNumericTraits.hpp +++ b/lib/kokkos/core/unit_test/TestNumericTraits.hpp @@ -21,6 +21,19 @@ #include #include "Kokkos_NumericTraits.hpp" +// Suppress "'long double' is treated as 'double' in device code" +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 20208 +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push 
+#pragma diag_suppress 20208 +#endif +#endif +#endif + struct extrema { #define DEFINE_EXTREMA(T, m, M) \ KOKKOS_FUNCTION static T min(T) { return m; } \ @@ -145,33 +158,25 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(MaxExponent10, int, int&) const { use_on_device(); } // clang-format on KOKKOS_FUNCTION void operator()(QuietNaN, int, int& e) const { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::quiet_NaN; constexpr auto nan = quiet_NaN::value; auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); -#else - (void)e; -#endif use_on_device(); } KOKKOS_FUNCTION void operator()(SignalingNaN, int, int& e) const { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::signaling_NaN; constexpr auto nan = signaling_NaN::value; auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); -#else - (void)e; -#endif use_on_device(); } KOKKOS_FUNCTION void use_on_device() const { -#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \ - defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ + defined(KOKKOS_ENABLE_OPENACC) take_by_value(trait::value); #else (void)take_address_of(trait::value); @@ -204,59 +209,46 @@ struct TestNumericTraits< #endif TEST(TEST_CATEGORY, numeric_traits_infinity) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 TestNumericTraits(); TestNumericTraits(); -#endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } TEST(TEST_CATEGORY, numeric_traits_epsilon) { -#ifndef KOKKOS_COMPILER_NVHPC // 
FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits(); TestNumericTraits(); -#endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } TEST(TEST_CATEGORY, numeric_traits_round_error) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits(); TestNumericTraits(); -#endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } TEST(TEST_CATEGORY, numeric_traits_norm_min) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits(); TestNumericTraits(); -#endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -264,9 +256,8 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) + // FIXME_OPENMPTARGET long double on Intel GPUs +#if 
(!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -303,10 +294,8 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -329,10 +318,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -354,10 +341,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -365,10 +350,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || 
!defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -389,10 +372,8 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -406,10 +387,8 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -420,31 +399,29 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif } TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 +// FIXME_NVHPC +#ifdef KOKKOS_COMPILER_NVHPC + GTEST_SKIP() << "This test is known to fail with the NVHPC compiler"; +#endif TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#endif TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC 23.7 long double // 
FIXME_OPENMPTARGET long double on Intel GPUs -#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ - (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -736,3 +713,13 @@ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(signaling_NaN); #undef CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT #undef CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES + +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#endif diff --git a/lib/kokkos/core/unit_test/TestOther.hpp b/lib/kokkos/core/unit_test/TestOther.hpp index fcf0353a88c..9daef3ca3f3 100644 --- a/lib/kokkos/core/unit_test/TestOther.hpp +++ b/lib/kokkos/core/unit_test/TestOther.hpp @@ -16,13 +16,8 @@ #ifndef KOKKOS_TEST_OTHER_HPP #define KOKKOS_TEST_OTHER_HPP -#include #include #include #include -// with VS 16.11.3 and CUDA 11.4.2 getting cudafe stackoverflow crash -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -#include -#endif #endif diff --git a/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp b/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp index c8c1542af13..d6920beed04 100644 --- a/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp +++ b/lib/kokkos/core/unit_test/TestRangePolicyConstructors.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace { @@ -196,4 +197,43 @@ TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) { #endif } +constexpr bool test_chunk_size_explicit() { + using ExecutionSpace = TEST_EXECSPACE; + using Kokkos::ChunkSize; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + static_assert(std::is_convertible_v); + static_assert(std::is_constructible_v); + // Some execution spaces were implicitly constructible from int + // which made the constructor call 
ambiguous. + static_assert( + std::is_constructible_v || + std::is_constructible_v< + Kokkos::RangePolicy, int, int, int>); + static_assert(std::is_constructible_v< + Kokkos::RangePolicy, int, int, + ChunkSize>); + static_assert(std::is_constructible_v, + ExecutionSpace, int, int, int>); + static_assert(std::is_constructible_v, + ExecutionSpace, int, int, ChunkSize>); +#else + static_assert(!std::is_convertible_v); + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v< + Kokkos::RangePolicy, int, int, int>); + static_assert(std::is_constructible_v< + Kokkos::RangePolicy, int, int, + ChunkSize>); + static_assert(!std::is_constructible_v, + ExecutionSpace, int, int, int>); + static_assert(std::is_constructible_v, + ExecutionSpace, int, int, ChunkSize>); +#endif + return true; +} + +static_assert(test_chunk_size_explicit()); + } // namespace diff --git a/lib/kokkos/core/unit_test/TestRealloc.hpp b/lib/kokkos/core/unit_test/TestRealloc.hpp index 2c9dc5ee473..f30c9e15e1c 100644 --- a/lib/kokkos/core/unit_test/TestRealloc.hpp +++ b/lib/kokkos/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/lib/kokkos/core/unit_test/TestResize.hpp b/lib/kokkos/core/unit_test/TestResize.hpp index 13d7e16d589..3102d2b9a16 100644 --- a/lib/kokkos/core/unit_test/TestResize.hpp +++ b/lib/kokkos/core/unit_test/TestResize.hpp 
@@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize(); // without data initialization } + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize diff --git a/lib/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp b/lib/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp new file mode 100644 index 00000000000..2fad17cb854 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#ifndef KOKKOS_ENABLE_CXX17 +#include +#endif + +template +struct funky_data_handle { + T* val; + + KOKKOS_FUNCTION + operator T*() { return val; } + KOKKOS_FUNCTION + operator const T*() const { return val; } +}; + +template +struct FunkyAcc { + using element_type = ElementType; + using reference = std::conditional_t, + element_type, element_type&>; + using data_handle_type = funky_data_handle; + using offset_policy = Kokkos::default_accessor; + KOKKOS_FUNCTION + reference access(data_handle_type p, size_t i) const { return p.val[i]; } + KOKKOS_FUNCTION + element_type* offset(data_handle_type p, size_t i) const { return p.val + i; } +}; + +template +void test_space_aware_accessor() { + using memory_space_t = MemorySpace; + using value_type = std::remove_const_t; + Kokkos::View v("V", 100); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = i; }); + + int errors; + using acc_t = Kokkos::Impl::SpaceAwareAccessor>; + acc_t acc{}; + typename acc_t::data_handle_type ptr{}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, v.extent(0)), + KOKKOS_LAMBDA(int i, int& error) { + if (acc.access(ptr, i) != ptr[i]) error++; + if (acc.offset(ptr, i) != ptr + i) error++; + static_assert(std::is_same_v); + if constexpr (std::is_const_v) { + static_assert(std::is_same_v); + } else { + static_assert(std::is_same_v); + } + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>>); + if constexpr (std::is_const_v) { + static_assert(std::is_same_v>); + } else { + static_assert(std::is_same_v); + } + static_assert(std::is_same_v); + static_assert(std::is_same_v&>); + static_assert(std::is_nothrow_move_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_swappable_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v>); +#ifndef KOKKOS_ENABLE_CXX17 + 
static_assert(std::copyable); + static_assert(std::is_empty_v); +#endif + }, + errors); + ASSERT_EQ(errors, 0); +} + +void test_space_aware_accessor_conversion() { + using ExecutionSpace = TEST_EXECSPACE; + using memory_space_t = typename ExecutionSpace::memory_space; + using T = float; + using acc_t = Kokkos::Impl::SpaceAwareAccessor>; + using const_acc_t = + Kokkos::Impl::SpaceAwareAccessor>; + using int_acc_t = + Kokkos::Impl::SpaceAwareAccessor>; + using host_acc_t = + Kokkos::Impl::SpaceAwareAccessor>; + using anon_acc_t = + Kokkos::Impl::SpaceAwareAccessor>; + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(int) { + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert( + std::is_constructible_v == + Kokkos::Impl::MemorySpaceAccess::assignable); + static_assert( + std::is_constructible_v == + Kokkos::Impl::MemorySpaceAccess::assignable); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v); + static_assert(std::is_convertible_v); + }); +} + +TEST(TEST_CATEGORY, mdspan_space_aware_accessor) { + using ExecutionSpace = TEST_EXECSPACE; + test_space_aware_accessor(); + test_space_aware_accessor(); + test_space_aware_accessor(); + test_space_aware_accessor(); + test_space_aware_accessor(); + test_space_aware_accessor(); + test_space_aware_accessor_conversion(); +} diff --git a/lib/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp b/lib/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp new file mode 100644 index 00000000000..b9982d5fc45 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp @@ -0,0 +1,128 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +template +struct TestMemoryAccessViolation { + Kokkos::Impl::SpaceAwareAccessor> + acc; + + KOKKOS_FUNCTION decltype(auto) bad_access() const { + return acc.access(nullptr, 0); + } + + KOKKOS_FUNCTION void operator()(int) const { ++bad_access(); } + + TestMemoryAccessViolation(ExecutionSpace const& s, + std::string const& matcher) { + constexpr bool accessible_from_execution_space = Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/MemorySpace>::accessible; + EXPECT_FALSE(accessible_from_execution_space); + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +void test_memory_access_violation(ExecutionSpace const& s, + std::string const& m) { + TestMemoryAccessViolation(s, m); +} + +template +void test_memory_access_violations_from_host() { + using memory_space_t = typename ExecutionSpace::memory_space; + using exec_space_t = Kokkos::DefaultHostExecutionSpace; + const exec_space_t exec_space{}; + std::string const message = + "Kokkos::SpaceAwareAccessor ERROR: attempt to access inaccessible memory " + "space"; + test_memory_access_violation(exec_space, + message); +} + +template +void test_memory_access_violations_from_device() { + using memory_space_t = Kokkos::HostSpace; + using exec_space_t = ExecutionSpace; + const exec_space_t exec_space{}; + std::string const message = + "Kokkos::SpaceAwareAccessor ERROR: attempt to access inaccessible memory " + "space"; + test_memory_access_violation(exec_space, + message); +} + +// 
FIXME_SYCL +#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) +TEST(TEST_CATEGORY_DEATH, + mdspan_space_aware_accessor_invalid_access_from_host) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (Kokkos::SpaceAccessibility< + /*AccessSpace=*/Kokkos::HostSpace, + /*MemorySpace=*/typename ExecutionSpace::memory_space>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + + test_memory_access_violations_from_host(); +} +#endif + +TEST(TEST_CATEGORY_DEATH, + mdspan_space_aware_accessor_invalid_access_from_device) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same::value) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_memory_access_violations_from_device(); +} diff --git a/lib/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp b/lib/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp new file mode 100644 index 00000000000..0de639e02e6 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp @@ -0,0 +1,199 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestTeamThreadMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy; + using TeamHandle = TeamPolicy::member_type; + + KOKKOS_FUNCTION void operator()(TeamHandle const& team_handle) const { + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + } + + TestTeamThreadMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +struct TestTeamVectorMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy; + using TeamHandle = TeamPolicy::member_type; + + KOKKOS_FUNCTION void operator()(TeamHandle const& team_handle) const { + { + 
Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v, TeamHandle>, + decltype(md_range)>); + } + } + + TestTeamVectorMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +struct TestThreadVectorMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy; + using TeamHandle = TeamPolicy::member_type; + + template + KOKKOS_FUNCTION static void check_types([ + [maybe_unused]] PolicyTypeToCheck const& team_handle) { + static_assert(std::is_same_v); + } + + KOKKOS_FUNCTION void operator()(TeamHandle const& team_handle) const { + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange 
md_range(team_handle, 0, 0, 0, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + check_types, TeamHandle>>( + md_range); + } + } + + TestThreadVectorMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp b/lib/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp new file mode 100644 index 00000000000..07aaeae819e --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestTeamPolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int i; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 warning about was declared but never referenced + TestTeamPolicyCTAD() { maybe_unused(des, notEs, ses, i, notEsToDes); } + + // Default construction deduces to TeamPolicy<> + static_assert( + std::is_same_v, decltype(Kokkos::TeamPolicy{})>); + + // Execution space not provided deduces to TeamPolicy<> + + static_assert( + std::is_same_v, decltype(Kokkos::TeamPolicy(i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(i, i, Kokkos::AUTO))>); + + // DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, 
i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, i, Kokkos::AUTO))>); + + // Convertible to DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy( + notEs, i, Kokkos::AUTO, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i, Kokkos::AUTO))>); + + // SES != DefaultExecutionSpace deduces to TeamPolicy + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i, Kokkos::AUTO))>); +}; + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index 5e16539d652..4d8f42720d8 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -1060,11 +1060,8 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) { constexpr int n = 1000000; constexpr 
int n_vector_range = 100; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v) { - GTEST_SKIP() << "All but max inclusive scan differ at index 101"; - } +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT #endif checkScan(0))); -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v) { - GTEST_SKIP() << "Disabling 2/3rd of the test for now"; - } -#endif ASSERT_TRUE((TestTeamVectorRange::Test(1))); // FIXME_OPENMPTARGET - Use of kokkos reducers currently results in runtime // memory errors. diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp index ca098dbc247..53c1f016789 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp @@ -837,18 +837,15 @@ struct TestViewMirror { view_const_cast(v)); } - template + template struct CopyUnInit { - using mirror_view_type = typename Kokkos::Impl::MirrorViewType< - Space, double *, Layout, Kokkos::HostSpace, MemoryTraits>::view_type; - - mirror_view_type a_d; + View a_d; KOKKOS_INLINE_FUNCTION - CopyUnInit(mirror_view_type &a_d_) : a_d(a_d_) {} + explicit CopyUnInit(View const &a_d_) : a_d(a_d_) {} KOKKOS_INLINE_FUNCTION - void operator()(const typename Space::size_type i) const { + void operator()(const typename View::size_type i) const { a_d(i) = (double)(10 - i); } }; @@ -875,7 +872,8 @@ struct TestViewMirror { Kokkos::parallel_for( Kokkos::RangePolicy(0, int(10)), - CopyUnInit(a_d)); + // decltype required for Intel classics, that doesn't recognize the CTAD + CopyUnInit(a_d)); Kokkos::deep_copy(a_h, a_d); @@ -1339,6 +1337,40 @@ class TestViewAPI { ASSERT_EQ(, nullptr); } + struct test_refcount_poison_copy_functor { + using view_type = Kokkos::View; + explicit test_refcount_poison_copy_functor(view_type v) : view(v) {} + + test_refcount_poison_copy_functor( + const 
test_refcount_poison_copy_functor &other) + : view(other.view) { + throw std::bad_alloc(); + } + + KOKKOS_INLINE_FUNCTION void operator()(int) const {} + + view_type view; + }; + + static void run_test_refcount_exception() { + using view_type = typename test_refcount_poison_copy_functor::view_type; + view_type original("original", N0); + ASSERT_EQ(original.use_count(), 1); + + // test_refcount_poison_copy_functor throws during copy construction + try { + Kokkos::parallel_for( + Kokkos::RangePolicy(0, N0), + test_refcount_poison_copy_functor(original)); + } catch (const std::bad_alloc &) { + } + + // Ensure refcounting is enabled, we should increment here + auto copy = original; + ASSERT_EQ(original.use_count(), 2); + ASSERT_EQ(copy.use_count(), 2); + } + static void run_test_deep_copy_empty() { // Check Deep Copy of LayoutLeft to LayoutRight { @@ -1539,56 +1571,6 @@ class TestViewAPI { typename multivector_type::const_type cmvX(cmv); typename const_multivector_type::const_type ccmvX(cmv); } - - static void run_test_error() { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same::value) - return; -#endif -// FIXME_MSVC_WITH_CUDA -// This test doesn't behave as expected on Windows with CUDA -#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) - return; -#endif - bool did_throw = false; - auto alloc_size = std::numeric_limits::max() - 42; - try { - auto should_always_fail = dView1("hello_world_failure", alloc_size); - } catch (std::runtime_error const &error) { - // TODO once we remove the conversion to std::runtime_error, catch the - // appropriate Kokkos error here - std::string msg = error.what(); - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "hello_world_failure", msg); - ASSERT_PRED_FORMAT2(::testing::IsSubstring, - typename device::memory_space{}.name(), msg); - // Can't figure out how to make assertions either/or, so we'll just use - // an if statement here for now. 
Test failure message will be a bit - // misleading, but developers should figure out what's going on pretty - // quickly. - if (msg.find("is not a valid size") != std::string::npos) { - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "is not a valid size", msg); - } else -#ifdef KOKKOS_ENABLE_SYCL - if (msg.find("insufficient memory") != std::string::npos) -#endif - { - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "insufficient memory", msg); - } - // SYCL cannot tell the reason why a memory allocation failed -#ifdef KOKKOS_ENABLE_SYCL - else { - // Otherwise, there has to be some sort of "unknown error" error - ASSERT_PRED_FORMAT2(::testing::IsSubstring, - "because of an unknown error.", msg); - } -#endif - did_throw = true; - } - ASSERT_TRUE(did_throw); - } }; } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewAPI_c.hpp b/lib/kokkos/core/unit_test/TestViewAPI_c.hpp index 5efbd95bc94..042da1e9842 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI_c.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI_c.hpp @@ -19,6 +19,7 @@ namespace Test { TEST(TEST_CATEGORY, view_api_c) { + TestViewAPI::run_test_refcount_exception(); TestViewAPI::run_test_deep_copy_empty(); TestViewAPI::run_test_view_operator_b(); } diff --git a/lib/kokkos/core/unit_test/TestViewAPI_d.hpp b/lib/kokkos/core/unit_test/TestViewAPI_d.hpp index b0d759ffccc..075ac3329c0 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI_d.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI_d.hpp @@ -26,22 +26,4 @@ TEST(TEST_CATEGORY, view_api_d) { TestViewAPI::run_test_view_operator_c(); } -TEST(TEST_CATEGORY, view_allocation_error) { -#if defined(__has_feature) -#if __has_feature(address_sanitizer) - GTEST_SKIP() << "AddressSanitzer detects allocating too much memory " - "preventing our checks to run"; -#endif -#endif -#if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) - GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; -#endif -#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC 
- if (std::is_same_v) { - GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; - } -#endif - TestViewAPI::run_test_error(); -} - } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp b/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp new file mode 100644 index 00000000000..7cb2f91655d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp @@ -0,0 +1,86 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_view_bad_alloc() { + bool did_throw = false; + auto too_large = std::numeric_limits::max() - 42; + std::string label = "my_label"; + try { + auto should_always_fail = + Kokkos::View(label, too_large); + } catch (std::runtime_error const &error) { + std::string msg = error.what(); + ASSERT_PRED_FORMAT2( + ::testing::IsSubstring, + std::string(MemorySpace::name()) + " memory space failed to allocate", + msg) + << "memory space name is missing"; + ASSERT_PRED_FORMAT2(::testing::IsSubstring, + std::string("(label=\"") + label + "\")", msg) + << "label is missing"; + did_throw = true; + } + ASSERT_TRUE(did_throw); +} + +TEST(TEST_CATEGORY, view_bad_alloc) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = ExecutionSpace::memory_space; +#if defined(__has_feature) +#if __has_feature(address_sanitizer) + if (std::is_same_v) { + GTEST_SKIP() << "AddressSanitizer detects allocating too much memory " + "preventing our checks to run"; + } +#endif +#endif +#if ((HIP_VERSION_MAJOR == 5) && 
(HIP_VERSION_MINOR == 3)) + if (std::is_same_v) { + GTEST_SKIP() + << "ROCm 5.3 segfaults when trying to allocate too much memory"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same_v) { + GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; + } +#endif + + test_view_bad_alloc(); + + constexpr bool execution_space_is_device = + std::is_same_v && + !std::is_same_v; + + if constexpr (execution_space_is_device) { + if constexpr (Kokkos::has_shared_space) { + test_view_bad_alloc(); + } + if constexpr (Kokkos::has_shared_host_pinned_space) { + test_view_bad_alloc(); + } + } +} + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestViewCopy_c.hpp b/lib/kokkos/core/unit_test/TestViewCopy_c.hpp new file mode 100644 index 00000000000..758af13c7df --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewCopy_c.hpp @@ -0,0 +1,434 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { +// Do not rely on deep_copy(0) as we want to test it! 
+template +void reset_view(const ExecSpace& space, ViewType& a, int magic) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + Kokkos::parallel_for( + "TestViewCopy::ResetView", policy, + KOKKOS_LAMBDA(int i) {[i] = magic; }); +} + +template +size_t compute_overall_sum(const ExecSpace& space, ViewType& a) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + typename ViewType::value_type sum = 0; + Kokkos::parallel_reduce( + "TestViewCopy::ComputeSum", policy, + KOKKOS_LAMBDA(int i, int& lcl_sum) { lcl_sum +=[i]; }, sum); + + return static_cast(sum); +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 0>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, 1); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank0", policy, + KOKKOS_LAMBDA(int, bool& local_check) { local_check &= (a() == magic); }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 1>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, a.extent(0)); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank1", policy, + KOKKOS_LAMBDA(int i, bool& local_check) { + local_check &= (a(i) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 2>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0}, {a.extent(0), a.extent(1)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + 
"TestViewCopy::CheckMagicValueRank2", policy, + KOKKOS_LAMBDA(int i0, int i1, bool& local_check) { + local_check &= (a(i0, i1) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 3>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank3", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, bool& local_check) { + local_check &= (a(i0, i1, i2) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 4>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank4", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, bool& local_check) { + local_check &= (a(i0, i1, i2, i3) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 5>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank5", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4) == magic); + }, + 
Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 6>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank6", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 7>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(6); ++outer) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank7", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 8>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer 
< a.extent(7); ++outer) { + for (size_t inner = 0; inner < a.extent(6); ++inner) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank8", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, inner, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + } + return all_elements_are_set; +} + +template +bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { + Kokkos::deep_copy(space, a, magic); +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + // FIXME_OPENMPTARGET Does not work with Land reducer + return true; +#else // KOKKOS_ENABLE_OPENMPTARGET + return check_magic_value(space, a, magic); +#endif // KOKKOS_ENABLE_OPENMPTARGET +} + +template +void run_test() { + int magic = 19; + + using ViewType = Kokkos::View; + // Create views with different lengths for each dimension + // We want to test if all loops are over the correct dimensions + // We use prime numbers to make sure that the strides are different + ViewType a_decreasing("a", 23, 19, 17, 13, 11, 7, 5, 3); + // We also test with increasing strides to catch more "out-of-bounds" errors + // within subviews. + ViewType a_increasing("a", 3, 5, 7, 11, 13, 17, 19, 23); + + using exec_space = typename Space::execution_space; + auto space = exec_space(); + + // Use subviews in the tests to have cases with different ranks and + // non-contiguous memory + // Tests have two parts: + // 1. Fill the subview with a magic value and check that all elements are set + // 2. 
Check if only the subview is set by summing all elements in the view and + // comparing to the subview size times the magic value + + // Rank 0 + { + auto sub_dec = Kokkos::subview(a_decreasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + static_cast(magic)); + + auto sub_inc = Kokkos::subview(a_increasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), + static_cast(magic)); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 1 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 2 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 3 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ( + compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * 
sub_dec.extent(1) * sub_dec.extent(2) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 4 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * + sub_dec.extent(3) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 5 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 6 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto 
sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 7 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 8 + { + auto sub_dec = Kokkos::subview( + a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview( + a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_right) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutRight; + run_test(); +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_left) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutLeft; + run_test(); +} + +} // namespace diff --git 
a/lib/kokkos/core/unit_test/TestViewLayoutTiled.hpp b/lib/kokkos/core/unit_test/TestViewLayoutTiled.hpp deleted file mode 100644 index 67308212ee0..00000000000 --- a/lib/kokkos/core/unit_test/TestViewLayoutTiled.hpp +++ /dev/null @@ -1,1756 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include - -#include -#include - -#include -#include - -namespace Test { - -namespace { - -template -struct TestViewLayoutTiled { - using Scalar = double; - - static constexpr int T0 = 2; - static constexpr int T1 = 4; - static constexpr int T2 = 4; - static constexpr int T3 = 2; - static constexpr int T4 = 2; - static constexpr int T5 = 2; - static constexpr int T6 = 2; - static constexpr int T7 = 2; - - // Rank 2 - using LayoutLL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 3 - using LayoutLL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 4 - using LayoutLL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_4D_2x4x4x2 = - 
Kokkos::Experimental::LayoutTiled; - using LayoutRR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_2d(const int, const int) { -#else - static void test_view_layout_tiled_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - // Test create_mirror_view, deep_copy - // Create LL View - { - using ViewType = - typename Kokkos::View; - ViewType v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } - - // Create RL View - { - using ViewType = - typename Kokkos::View; 
- Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, 
Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < 
NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_2d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_3d(const int, const int, const int) { -#else - static void test_view_layout_tiled_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; 
++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) 
+ 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; 
++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_3d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_4d(const int, const int, const int, - const int){ -#else - static void test_view_layout_tiled_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, 
N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = 
Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - 
for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 
0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_4d - - static void test_view_layout_tiled_subtile_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - - // Counter to check for errors at the end - long counter[4] 
= {0}; - - // Create LL View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // 
Create LR View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; - std::cout << "subview tile rank = " << 
Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - } // end test_view_layout_tiled_subtile_2d - - static void test_view_layout_tiled_subtile_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) 
- << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int tk 
= 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << 
"," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_3d - - static void test_view_layout_tiled_subtile_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; 
++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1 + - tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - 
++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - 
std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + 
k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_4d - -}; // end TestViewLayoutTiled struct - -} // namespace - -TEST(TEST_CATEGORY, view_layouttiled) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_3d(4, 12, 16); - TestViewLayoutTiled::test_view_layout_tiled_4d(4, 12, 16, 12); -} -TEST(TEST_CATEGORY, view_layouttiled_subtile) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_subtile_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_subtile_3d(4, 12, - 16); - TestViewLayoutTiled::test_view_layout_tiled_subtile_4d( - 4, 12, 16, 12); -} -} // namespace Test - -#undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/lib/kokkos/core/unit_test/TestViewOfViews.hpp b/lib/kokkos/core/unit_test/TestViewOfViews.hpp new file mode 100644 index 
00000000000..a87c829bb73 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewOfViews.hpp @@ -0,0 +1,75 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +// User-defined type with a View data member +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() = default; +}; + +template +void test_view_of_views() { + using VoV = Kokkos::View; + { // assigning a default-constructed view to destruct the inner objects + VoV vov("vov", 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; +#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + vov(0, 0) = V(); + vov(1, 0) = V(); + vov(0, 1) = V(); +#endif + } + { // using placement new to construct the inner objects and explicitly + // calling the destructor + VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); + V a("a"); + V b("b"); + new (&vov(0, 0)) V(a); + new (&vov(1, 0)) V(a); + new (&vov(0, 1)) V(b); +#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + vov(0, 0).~V(); + vov(1, 0).~V(); + vov(0, 1).~V(); +#else + // leaks memory +#endif + } +} + +TEST(TEST_CATEGORY, view_of_views) { + test_view_of_views>(); + test_view_of_views>(); + // User-defined type with View data member + test_view_of_views>>(); +} + +} // namespace diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp 
b/lib/kokkos/core/unit_test/TestViewSubview.hpp index 386887d923e..c60aa2fe269 100644 --- a/lib/kokkos/core/unit_test/TestViewSubview.hpp +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -2294,9 +2294,8 @@ template struct TestExtentsStaticTests { using test1 = typename static_expect_same< /* expected */ - Kokkos::Experimental::Extents, + Kokkos::Experimental::Extents, /* actual */ typename Kokkos::Impl::ParseViewExtents::type>::type; diff --git a/lib/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp b/lib/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp new file mode 100644 index 00000000000..b2176f3ef05 --- /dev/null +++ b/lib/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp @@ -0,0 +1,155 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include + +namespace { + +/** + * Fixture that checks Kokkos is neither initialized nor finalized before and + * after the test. + */ +class AssertEnvironmentTest : public ::testing::Test { + protected: + void SetUp() override { + ASSERT_FALSE(Kokkos::is_initialized()); + ASSERT_FALSE(Kokkos::is_finalized()); + } + + void TearDown() override { + ASSERT_FALSE(Kokkos::is_initialized()); + ASSERT_FALSE(Kokkos::is_finalized()); + } +}; + +using scope_guard_DeathTest = AssertEnvironmentTest; + +/** + * Test to create a scope guard normally. 
+ */ +TEST_F(scope_guard_DeathTest, create) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + // run it in a different process so side effects are not kept + EXPECT_EXIT( + { + { + Kokkos::ScopeGuard guard{}; + + if (!Kokkos::is_initialized()) std::exit(EXIT_FAILURE); + if (Kokkos::is_finalized()) std::exit(EXIT_FAILURE); + } + + if (Kokkos::is_initialized()) std::exit(EXIT_FAILURE); + if (!Kokkos::is_finalized()) std::exit(EXIT_FAILURE); + + std::exit(EXIT_SUCCESS); + }, + testing::ExitedWithCode(EXIT_SUCCESS), ""); +} + +/** + * Test to create a scope guard with an argument. + */ +TEST_F(scope_guard_DeathTest, create_argument) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + // run it in a different process so side effects are not kept + EXPECT_EXIT( + { + { + Kokkos::InitializationSettings settings{}; + Kokkos::ScopeGuard guard{settings}; + } + + std::exit(EXIT_SUCCESS); + }, + testing::ExitedWithCode(EXIT_SUCCESS), ""); +} + +/** + * Test to create another scope guard when one has been created. + */ +TEST_F(scope_guard_DeathTest, create_while_initialize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + Kokkos::ScopeGuard guard1{}; + + // create a second scope guard while there is one already existing + Kokkos::ScopeGuard guard2{}; + }, + "Creating a ScopeGuard while Kokkos is initialized"); +} + +/** + * Test to create a scope guard when initialization has been done manually. + */ +TEST_F(scope_guard_DeathTest, create_after_initialize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + Kokkos::initialize(); + + // create a scope guard after manual initialization + Kokkos::ScopeGuard guard{}; + }, + "Creating a ScopeGuard while Kokkos is initialized"); +} + +/** + * Test to create another scope guard when one has been destroyed. 
+ */ +TEST_F(scope_guard_DeathTest, create_after_finalize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + { Kokkos::ScopeGuard guard1{}; } + + // create a second scope guard while the first one has been destroyed + // already + Kokkos::ScopeGuard guard2{}; + }, + "Creating a ScopeGuard after Kokkos was finalized"); +} + +/** + * Test to destroy a scope guard when finalization has been done manually. + */ +TEST_F(scope_guard_DeathTest, destroy_after_finalize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + // create a scope guard and finalize it manually + Kokkos::ScopeGuard guard{}; + Kokkos::finalize(); + }, + "Destroying a ScopeGuard after Kokkos was finalized"); +} + +/** + * Static tests + */ + +// Test scope guard is not copyable. +static_assert(!std::is_copy_assignable()); +static_assert(!std::is_copy_constructible()); + +// Test scope guard is not movable. +static_assert(!std::is_move_assignable()); +static_assert(!std::is_move_constructible()); + +} // namespace diff --git a/lib/kokkos/core/unit_test/category_files/TestHPX_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestHPX_Category.hpp index d3a7cdbea53..c6a2aa9f201 100644 --- a/lib/kokkos/core/unit_test/category_files/TestHPX_Category.hpp +++ b/lib/kokkos/core/unit_test/category_files/TestHPX_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 3 #define TEST_CATEGORY_DEATH hpx_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::HPX +#define TEST_CATEGORY_FIXTURE(name) hpx_##name #endif diff --git a/lib/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp index 0c4e4b7e119..6105eadf14f 100644 --- a/lib/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp +++ b/lib/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 8 #define TEST_CATEGORY_DEATH openacc_DeathTest #define 
TEST_EXECSPACE Kokkos::Experimental::OpenACC +#define TEST_CATEGORY_FIXTURE(name) openacc_##name #endif diff --git a/lib/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp index 235b34ffab7..921cff78902 100644 --- a/lib/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp +++ b/lib/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 4 #define TEST_CATEGORY_DEATH openmptarget_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget +#define TEST_CATEGORY_FIXTURE(name) openmptarget_##name #endif diff --git a/lib/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp index 8e1b18c9acd..59e72c72c77 100644 --- a/lib/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp +++ b/lib/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 7 #define TEST_CATEGORY_DEATH sycl_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::SYCL +#define TEST_CATEGORY_FIXTURE(name) sycl_##name #endif diff --git a/lib/kokkos/core/unit_test/category_files/TestThreads_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestThreads_Category.hpp index 13b0b653f21..ae8ac608339 100644 --- a/lib/kokkos/core/unit_test/category_files/TestThreads_Category.hpp +++ b/lib/kokkos/core/unit_test/category_files/TestThreads_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 1 #define TEST_CATEGORY_DEATH threads_DeathTest #define TEST_EXECSPACE Kokkos::Threads +#define TEST_CATEGORY_FIXTURE(name) threads_##name #endif diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp deleted file mode 100644 index 27203639690..00000000000 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index d94735ceb23..40955e9c7ca 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include namespace { @@ -57,79 +57,6 @@ std::array get_execution_spaces( return {exec0, exec1}; } -// Test Interoperability with Cuda Streams -void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, - TEST_EXECSPACE exec, Kokkos::View v) { - using MemorySpace = typename TEST_EXECSPACE::memory_space; - - Kokkos::deep_copy(exec, v, 5); - Kokkos::deep_copy(exec0, v0, 5); - - Kokkos::deep_copy(v, v0); - - int sum; - int sum0; - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", - Kokkos::RangePolicy(exec0, 0, 100), - Test::FunctorRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", - Kokkos::RangePolicy(exec, 0, 100), - Test::FunctorRange(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::RangeReduce_0", - Kokkos::RangePolicy>(exec0, - 0, 100), - Test::FunctorRangeReduce(v0), sum0); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::RangeReduce", - Kokkos::RangePolicy>(exec, 0, - 100), - Test::FunctorRangeReduce(v), sum); - ASSERT_EQ(600, sum0); - ASSERT_EQ(600, sum); - - 
Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", - Kokkos::MDRangePolicy>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", - Kokkos::MDRangePolicy>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRange(v)); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v0), sum0); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v), sum); - ASSERT_EQ(700, sum0); - ASSERT_EQ(700, sum); - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy(exec0, 10, 10), - Test::FunctorTeam(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", - Kokkos::TeamPolicy(exec, 10, 10), - Test::FunctorTeam(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy>(exec0, - 10, 10), - Test::FunctorTeamReduce(v0), sum0); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team", - Kokkos::TeamPolicy>(exec, 10, - 10), - Test::FunctorTeamReduce(v), sum); - ASSERT_EQ(800, sum0); - ASSERT_EQ(800, sum); -} - TEST(cuda_multi_gpu, managed_views) { StreamsAndDevices streams_and_devices; { @@ -169,93 +96,6 @@ TEST(cuda_multi_gpu, unmanaged_views) { } } -struct ScratchFunctor { - int scratch_size; - int R; - - ScratchFunctor(int scratch_size_, int R_) - : scratch_size(scratch_size_), R(R_) {} - - KOKKOS_FUNCTION - void operator()(const Kokkos::TeamPolicy::member_type &team, - int &error_accum) const { - Kokkos::View scratch_mem( - team.team_scratch(1), scratch_size); - - // Initialize scratch memory - Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) = 0; }); - team.team_barrier(); - - // Increment each entry in scratch memory R 
times - for (int r = 0; r < R; ++r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) += 1; }); - } - team.team_barrier(); - - // Check that each scratch entry has been incremented exactly R times - int team_error_accum; - auto R_loc = R; // avoid implicit capture of this - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i, int &tsum) { - if (scratch_mem(i) != R_loc) { - tsum += 1; - } - }, - team_error_accum); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { error_accum += team_error_accum; }); - } -}; - -void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { - constexpr int N = 10; - constexpr int R = 1000; - constexpr int scratch_size = 100; - using ScratchType = Kokkos::View; - - // Test allocating and using scratch space - ScratchFunctor f(scratch_size, R); - - auto policy0 = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - auto policy1 = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - - int error0, error1; - - Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); - Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); - - // Request larger scratch size to trigger a realloc and test - const auto new_scratch_size = scratch_size + 10; - ScratchFunctor f_more_scratch(new_scratch_size, R); - - auto policy0_more_scratch = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - auto policy1_more_scratch = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - - Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, - f_more_scratch, error0); - 
Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, - f_more_scratch, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); -} - TEST(cuda_multi_gpu, scratch_space) { StreamsAndDevices streams_and_devices; { diff --git a/lib/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt b/lib/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt index f792b03ed88..4c364ceee75 100644 --- a/lib/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt @@ -10,7 +10,8 @@ file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src ${BASE_DIR}/algorithms/src/*.hpp) -if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) +# erroring out when deprecated code is disabled and raising warnings that are treated as errors in the CI otherwise +if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 OR Kokkos_ENABLE_DEPRECATION_WARNINGS) list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp") endif() diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index a213453ea18..8c72e9f2972 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/lib/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,9 +48,6 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. 
As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host - // FIXME_HIP - GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " - "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); diff --git a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp index d7b2a57b442..a7fa26c7282 100644 --- a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp +++ b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp @@ -63,7 +63,9 @@ struct TestIncrExecSpace { ASSERT_GT(concurrency, 0); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() int in_parallel = ExecSpace::in_parallel(); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() ASSERT_FALSE(in_parallel); #endif diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp deleted file mode 100644 index 22c8ab1bf8f..00000000000 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Graph.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Graph.cpp deleted file mode 100644 index bff64d83e27..00000000000 --- a/lib/kokkos/core/unit_test/serial/TestSerial_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..d3906e409f5 --- /dev/null +++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +std::array get_execution_spaces() { + std::vector gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); + + TEST_EXECSPACE exec0( + sycl::queue{gpu_devices.front(), sycl::property::queue::in_order()}); + TEST_EXECSPACE exec1( + sycl::queue{gpu_devices.back(), sycl::property::queue::in_order()}); + + return {exec0, exec1}; +} + +TEST(sycl_multi_gpu, managed_views) { + std::array execs = get_execution_spaces(); + + Kokkos::View view0(Kokkos::view_alloc("v0", execs[0]), + 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); +} + +TEST(sycl_multi_gpu, unmanaged_views) { + std::array execs = get_execution_spaces(); + + int *p0 = sycl::malloc_device(100, execs[0].sycl_queue()); + Kokkos::View view0(p0, 100); + + int *p1 = sycl::malloc_device(100, execs[1].sycl_queue()); + Kokkos::View view1(p1, 100); + + test_policies(execs[0], view0, execs[1], view1); + sycl::free(p0, execs[0].sycl_queue()); + sycl::free(p1, execs[1].sycl_queue()); +} + +TEST(sycl_multi_gpu, scratch_space) { + std::array execs = get_execution_spaces(); + + test_scratch(execs[0], execs[1]); +} +} // namespace diff --git a/lib/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/lib/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp index b95890614e0..1b9b2a36819 100644 --- a/lib/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp +++ b/lib/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp @@ -23,15 +23,14 @@ namespace { // Helper to make static tests more succinct template -constexpr bool datatype_matches_extent = - std::is_same_v::type, - Extent>; +constexpr bool datatype_matches_extent = std::is_same_v< + typename Kokkos::Impl::ExtentsFromDataType::type, + Extent>; template constexpr bool extent_matches_datatype = - std::is_same_v::type>; + std::is_same_v::type>; 
// Conversion from DataType to extents // 0-rank view diff --git a/lib/kokkos/example/README b/lib/kokkos/example/README index 66860512448..2fe87276484 100644 --- a/lib/kokkos/example/README +++ b/lib/kokkos/example/README @@ -1,7 +1,7 @@ This directory contains example application proxies that use different parts of Kokkos. If you are looking for the FENL ("finite element -nonlinear" solve) example, it has moved into the LinAlg subpackage of -Tpetra. +nonlinear" solve) example, it has moved into the TrilinosCouplings +package in Trilinos. MANIFEST: diff --git a/lib/kokkos/example/build_cmake_installed/CMakeLists.txt b/lib/kokkos/example/build_cmake_installed/CMakeLists.txt index aaf745b418d..c025f1d7d28 100644 --- a/lib/kokkos/example/build_cmake_installed/CMakeLists.txt +++ b/lib/kokkos/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags diff --git a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp index 22b8b6d63c8..3104003fb48 100644 --- a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp +++ b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp @@ -16,7 +16,6 @@ #include #include -#include // // "Hello world" parallel_for example: @@ -25,12 +24,12 @@ // using a functor to define the loop body // 3. Shut down Kokkos // -// If Kokkos was built with C++11 enabled, try comparing this example -// to 01_hello_world_lambda. The latter uses C++11 lambdas (anonymous -// functions) to define the loop body of the parallel_for. That makes -// the code much more concise and readable. 
On the other hand, -// breaking out the loop body into an explicit functor makes it easier -// to test the loop independently of the parallel pattern. +// Try comparing this example to 01_hello_world_lambda, which uses +// C++11 lambdas (anonymous functions) to define the loop body of the +// parallel_for. That makes the code much more concise and readable. +// On the other hand, breaking out the loop body into an explicit +// functor makes it easier to test the loop independently of the +// parallel pattern. // // Functor that defines the parallel_for's loop body. @@ -72,11 +71,9 @@ int main(int argc, char* argv[]) { // start with "--kokkos-". Kokkos::initialize(argc, argv); - // Print the name of Kokkos' default execution space. We're using - // typeid here, so the name might get a bit mangled by the linker, - // but you should still be able to figure out what it is. + // Print the name of Kokkos' default execution space. printf("Hello World on Kokkos execution space %s\n", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); // Run the above functor on the default Kokkos execution space in // parallel, with a parallel for loop count of 15. diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index 909765e1fc3..ad2c258c0fe 100644 --- a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -16,7 +16,6 @@ #include #include -#include // // "Hello world" parallel_for example: @@ -25,10 +24,9 @@ // using a C++11 lambda to define the loop body // 3. Shut down Kokkos // -// This example only builds if C++11 is enabled. Compare this example -// to 01_hello_world, which uses functors (explicitly defined classes) -// to define the loop body of the parallel_for. Both functors and -// lambdas have their places. 
+// Compare this example to 01_hello_world, which uses functors +// (explicitly defined classes) to define the loop body of the +// parallel_for. Both functors and lambdas have their places. // int main(int argc, char* argv[]) { @@ -41,11 +39,9 @@ int main(int argc, char* argv[]) { // start with "--kokkos-". Kokkos::initialize(argc, argv); - // Print the name of Kokkos' default execution space. We're using - // typeid here, so the name might get a bit mangled by the linker, - // but you should still be able to figure out what it is. + // Print the name of Kokkos' default execution space. printf("Hello World on Kokkos execution space %s\n", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); // Run lambda on the default Kokkos execution space in parallel, // with a parallel for loop count of 15. The lambda's argument is diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp index 5cae6da16cf..1ca30e07e88 100644 --- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp +++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp @@ -24,9 +24,8 @@ // using a C++11 lambda to define the loop body // 3. Shut down Kokkos // -// This example only builds if C++11 is enabled. Compare this example -// to 02_simple_reduce, which uses a functor to define the loop body -// of the parallel_reduce. +// Compare this example to 02_simple_reduce, which uses a functor to +// define the loop body of the parallel_reduce. 
// int main(int argc, char* argv[]) { diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 70dd61f9af0..25370daa3f2 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -164,7 +164,6 @@ display_help_text() { echo " AMD_GFX942 = AMD GPU MI300 GFX942" echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030" echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100" - echo " AMD_GFX1103 = AMD APU Radeon 740M/760M/780M/880M/890M GFX1103" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index 31be9253254..a0e83bef237 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -37,3 +37,4 @@ tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e +tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c diff --git a/lib/kokkos/simd/src/Kokkos_SIMD.hpp b/lib/kokkos/simd/src/Kokkos_SIMD.hpp index 57d4afd88be..5e34e51989c 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD.hpp @@ -183,15 +183,18 @@ template class data_types {}; #if defined(KOKKOS_ARCH_AVX512XEON) -using host_abi_set = abi_set>; +using host_abi_set = abi_set, + simd_abi::avx512_fixed_size<16>>; using data_type_set = data_types; #elif defined(KOKKOS_ARCH_AVX2) -using host_abi_set = abi_set>; +using host_abi_set = abi_set, + simd_abi::avx2_fixed_size<8>>; using data_type_set = data_types; #elif defined(KOKKOS_ARCH_ARM_NEON) -using host_abi_set = abi_set>; +using host_abi_set = abi_set, + simd_abi::neon_fixed_size<4>>; using data_type_set = data_types; #else diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 
6d0956f3832..27c8af79abd 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -228,6 +228,106 @@ class simd_mask> { } }; +template <> +class simd_mask> { + __m256 m_value; + + public: + class reference { + __m256& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __m256 bit_mask() const { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + return _mm256_cvtepi32_ps(_mm256_setr_epi32( +#else + return _mm256_castsi256_ps(_mm256_setr_epi32( +#endif + -std::int32_t(m_lane == 0), -std::int32_t(m_lane == 1), + -std::int32_t(m_lane == 2), -std::int32_t(m_lane == 3), + -std::int32_t(m_lane == 4), -std::int32_t(m_lane == 5), + -std::int32_t(m_lane == 6), -std::int32_t(m_lane == 7))); + } + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__m256& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask = _mm256_or_ps(bit_mask(), m_mask); + } else { + m_mask = _mm256_andnot_ps(bit_mask(), m_mask); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (_mm256_movemask_ps(m_mask) & (1 << m_lane)) != 0; + } + }; + using value_type = bool; + using abi_type = simd_abi::avx2_fixed_size<8>; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(_mm256_castsi256_ps(_mm256_set1_epi32(-std::int32_t(value)))) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : m_value(_mm256_castsi256_ps(_mm256_setr_epi32( + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + 
-std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant()))))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __m256 const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast( + reference(const_cast<__m256&>(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_mm256_or_ps(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_mm256_and_ps(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + auto const true_value = static_cast<__m256>(simd_mask(true)); + return simd_mask(_mm256_andnot_ps(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return _mm256_movemask_ps(m_value) == _mm256_movemask_ps(other.m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + simd_mask const& other) const { + return !operator==(other); + } +}; + template <> class simd_mask> { __m128i m_value; @@ -324,6 +424,109 @@ class simd_mask> { } }; +template <> +class simd_mask> { + __m256i m_value; + + public: + class reference { + __m256i& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __m256i bit_mask() const { + return _mm256_setr_epi32( + -std::int32_t(m_lane == 0), -std::int32_t(m_lane == 1), + 
-std::int32_t(m_lane == 2), -std::int32_t(m_lane == 3), + -std::int32_t(m_lane == 4), -std::int32_t(m_lane == 5), + -std::int32_t(m_lane == 6), -std::int32_t(m_lane == 7)); + } + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__m256i& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask = _mm256_or_si256(bit_mask(), m_mask); + } else { + m_mask = _mm256_andnot_si256(bit_mask(), m_mask); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (_mm256_movemask_ps(_mm256_castsi256_ps(m_mask)) & + (1 << m_lane)) != 0; + } + }; + using value_type = bool; + using abi_type = simd_abi::avx2_fixed_size<8>; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(_mm256_set1_epi32(-std::int32_t(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __m256i const& value_in) + : m_value(value_in) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : m_value(_mm256_setr_epi32( + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())), + -std::int32_t(gen(std::integral_constant())))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask const& other) { + for (std::size_t i = 
0; i < size(); ++i) (*this)[i] = other[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast( + reference(const_cast<__m256i&>(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_mm256_or_si256(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_mm256_and_si256(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + auto const true_value = static_cast<__m256i>(simd_mask(true)); + return simd_mask(_mm256_andnot_si256(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return _mm256_movemask_ps(_mm256_castsi256_ps(m_value)) == + _mm256_movemask_ps(_mm256_castsi256_ps(other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + simd_mask const& other) const { + return !operator==(other); + } +}; + template <> class simd_mask> { __m256i m_value; @@ -800,11 +1003,11 @@ class simd> { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 4; } - template , - bool> = false> + template , + bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) : m_value(_mm_set1_ps(value_type(value))) {} - template >, @@ -1031,12 +1234,12 @@ namespace Experimental { } template <> -class simd> { - __m128i m_value; +class simd> { + __m256 m_value; public: - using value_type = std::int32_t; - using abi_type = simd_abi::avx2_fixed_size<4>; + using value_type = float; + using abi_type = simd_abi::avx2_fixed_size<8>; using mask_type = simd_mask; using reference = 
value_type&; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; @@ -1045,29 +1248,30 @@ class simd> { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { - return 4; + return 8; } template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm_set1_epi32(value_type(value))) {} + : m_value(_mm256_set1_ps(value_type(value))) {} template >, bool> = false> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - G&& gen) noexcept - : m_value(_mm_setr_epi32(gen(std::integral_constant()), + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) + : m_value(_mm256_setr_ps(gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), - gen(std::integral_constant()))) { + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m128i const& value_in) + __m256 const& value_in) : m_value(value_in) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd const& other); KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -1077,93 +1281,350 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used - // here. 
-#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE - m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); -#else - m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); -#endif + m_value = _mm256_loadu_ps(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, vector_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. -#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE - m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); -#else - m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); -#endif + m_value = _mm256_load_ps(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + _mm256_storeu_ps(ptr, m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, vector_aligned_tag) const { - _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + _mm256_store_ps(ptr, m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmpeq_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmpgt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return 
simd(_mm256_mul_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_div_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_add_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sub_ps(lhs.m_value, rhs.m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmplt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_LT_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_GT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return (lhs < rhs) || (lhs == rhs); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_LE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return (lhs > rhs) || (lhs == rhs); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_GE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator!=(simd const& lhs, simd const& rhs) noexcept { - return !(lhs == rhs); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( - simd const& lhs, simd const& rhs) noexcept { - return simd( - 
_mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_EQ_OS)); } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, int rhs) noexcept { - return simd(_mm_srai_epi32(static_cast<__m128i>(lhs), rhs)); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_NEQ_OS)); } +}; - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_srav_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } +} // namespace Experimental - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) noexcept { - return simd(_mm_slli_epi32(static_cast<__m128i>(lhs), rhs)); - } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +copysign( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd>( + _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)), + _mm256_and_ps(sign_mask, static_cast<__m256>(b)))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
Experimental::simd> + abs(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd>( + _mm256_andnot_ps(sign_mask, static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_sqrt_ps(static_cast<__m256>(a))); +} + +#ifdef __INTEL_COMPILER + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + cbrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_cbrt_ps(static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
Experimental::simd> + exp(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_exp_ps(static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + log(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_log_ps(static_cast<__m256>(a))); +} + +#endif + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +fma(Experimental::simd> const& + a, + Experimental::simd> const& + b, + Experimental::simd> const& + c) { + return Experimental::simd>( + _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b), + static_cast<__m256>(c))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +max(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( + _mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +min(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( + _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>(_mm256_blendv_ps( + static_cast<__m256>(c), static_cast<__m256>(b), static_cast<__m256>(a))); +} + +template <> +class simd> { + __m128i m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::avx2_fixed_size<4>; + using mask_type = simd_mask; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) 
= default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm_set1_epi32(value_type(value))) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value(_mm_setr_epi32(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) { + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m128i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. 
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmpeq_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmpgt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmplt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return (lhs < rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return (lhs > rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + 
_mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm_srai_epi32(static_cast<__m128i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_srav_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm_slli_epi32(static_cast<__m128i>(lhs), rhs)); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, simd const& rhs) noexcept { @@ -1229,6 +1690,207 @@ namespace Experimental { _mm_castsi128_ps(static_cast<__m128i>(a))))); } +template <> +class simd> { + __m256i m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::avx2_fixed_size<8>; + using mask_type = simd_mask; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + template , + bool> = false> + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm256_set1_epi32(value_type(value))) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value( + _mm256_setr_epi32(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi32(ptr, static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. 
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi32(ptr, static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_maskstore_epi32(ptr, static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi32(ptr, static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpeq_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpgt_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return !(lhs >= rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return (lhs < rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return (lhs > rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_sub_epi32(static_cast<__m256i>(lhs), 
static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_srai_epi32(static_cast<__m256i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_srav_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + __m256i const rhs = static_cast<__m256i>(a); + return Experimental::simd>( + _mm256_abs_epi32(rhs)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int32_t, 
Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>(_mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(static_cast<__m256i>(c)), + _mm256_castsi256_ps(static_cast<__m256i>(b)), + _mm256_castsi256_ps(static_cast<__m256i>(a))))); +} + template <> class simd> { __m256i m_value; @@ -1515,6 +2177,16 @@ class simd> { static_cast<__m256i>(mask_type(true))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1821,6 +2493,94 @@ class where_expression>, } }; +template <> +class const_where_expression>, + simd>> { + public: + using abi_type = simd_abi::avx2_fixed_size<8>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type 
const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + _mm256_maskstore_ps(mem, _mm256_castps_si256(static_cast<__m256>(m_mask)), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_maskstore_ps(mem, _mm256_castps_si256(static_cast<__m256>(m_mask)), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd> const& index) const { + for (std::size_t lane = 0; lane < value_type::size(); ++lane) { + if (m_mask[lane]) mem[index[lane]] = m_value[lane]; + } + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, element_aligned_tag) { + m_value = value_type(_mm256_maskload_ps( + mem, _mm256_castps_si256(static_cast<__m256>(m_mask)))); + } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_ps( + mem, _mm256_castps_si256(static_cast<__m256>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + float const* mem, + simd> const& index) { + m_value = value_type(_mm256_mask_i32gather_ps( + static_cast<__m256>(m_value), mem, static_cast<__m256i>(index), + static_cast<__m256>(m_mask), 4)); + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + 
static_cast>>( + std::forward(x)); + m_value = simd>(_mm256_blendv_ps( + static_cast<__m256>(m_value), static_cast<__m256>(x_as_value_type), + static_cast<__m256>(m_mask))); + } +}; + template <> class const_where_expression< simd_mask>, @@ -1923,6 +2683,109 @@ class where_expression>, } }; +template <> +class const_where_expression< + simd_mask>, + simd>> { + public: + using abi_type = simd_abi::avx2_fixed_size<8>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + _mm256_maskstore_epi32(mem, static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_maskstore_epi32(mem, static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd> const& index) const { + for (std::size_t lane = 0; lane < value_type::size(); ++lane) { + if (m_mask[lane]) mem[index[lane]] = m_value[lane]; + } + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = 
_mm256_loadu_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = + value_type(_mm256_maskload_epi32(mem, static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = + value_type(_mm256_maskload_epi32(mem, static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd> const& index) { + m_value = value_type(_mm256_mask_i32gather_epi32( + static_cast<__m256i>(m_value), mem, static_cast<__m256i>(index), + static_cast<__m256i>(m_mask), 4)); + } + template < + class U, + std::enable_if_t>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = simd>( + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(static_cast<__m256i>(m_value)), + _mm256_castsi256_ps(static_cast<__m256i>(x_as_value_type)), + _mm256_castsi256_ps(static_cast<__m256i>(m_mask))))); + } +}; + template <> class const_where_expression< simd_mask>, diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp index 7fa35c204ae..84e8af3cd76 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp @@ -140,6 +140,122 @@ class simd_mask> { } }; +template +class simd_mask> { + __mmask16 m_value; + + public: + class reference { + __mmask16& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __mmask16 bit_mask() const { + return __mmask16(std::int32_t(1 << m_lane)); + } + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__mmask16& 
mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask |= bit_mask(); + } else { + m_mask &= ~bit_mask(); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (m_mask & bit_mask()) != 0; + } + }; + using value_type = bool; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(-std::int32_t(value)) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask> const& other) + : m_value(static_cast<__mmask16>(other)) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(G&& gen) : m_value(false) { + reference(m_value, int(0)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(1)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(2)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(3)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(4)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(5)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(6)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(7)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(8)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(9)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(10)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(11)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(12)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(13)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(14)) = + static_cast(gen(std::integral_constant())); + reference(m_value, int(15)) = 
+ static_cast(gen(std::integral_constant())); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __mmask16 const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __mmask16() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + auto const bit_mask = __mmask16(std::int32_t(1 << i)); + return (m_value & bit_mask) != 0; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_kor_mask16(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_kand_mask16(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + static const __mmask16 true_value(static_cast<__mmask16>(simd_mask(true))); + return simd_mask(_kxor_mask16(true_value, m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return m_value == other.m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + simd_mask const& other) const { + return m_value != other.m_value; + } +}; + template <> class simd> { __m512d m_value; @@ -700,6 +816,280 @@ simd> condition( static_cast<__m256>(b))); } +template <> +class simd> { + __m512 m_value; + + public: + using value_type = float; + using abi_type = simd_abi::avx512_fixed_size<16>; + using mask_type = simd_mask; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm512_set1_ps(value_type(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m512 const& value_in) + : m_value(value_in) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) + : m_value( + _mm512_setr_ps(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm512_loadu_ps(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_ps(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_storeu_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
constexpr explicit operator __m512() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return simd(_mm512_sub_ps(_mm512_set1_ps(0.0), m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_mul_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_div_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_add_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_sub_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LT_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GT_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LE_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GE_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, 
_CMP_EQ_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_NEQ_OS)); + } +}; + +} // namespace Experimental + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> +copysign(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + __m512 const sign_mask = _mm512_set1_ps(-0.0); + return Experimental::simd>( + _mm512_xor_ps(_mm512_andnot_ps(sign_mask, static_cast<__m512>(a)), + _mm512_and_ps(sign_mask, static_cast<__m512>(b)))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> abs( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const sign_mask = _mm512_set1_ps(-0.0); + return Experimental::simd>( + _mm512_andnot_ps(sign_mask, static_cast<__m512>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_NEG_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_POS_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_NEAREST_INT)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_ZERO)); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> sqrt( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_sqrt_ps(static_cast<__m512>(a))); +} + +#ifdef __INTEL_COMPILER + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> cbrt( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cbrt_ps(static_cast<__m512>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> exp( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_exp_ps(static_cast<__m512>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> log( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_log_ps(static_cast<__m512>(a))); +} + +#endif + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> fma( + Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& c) { + return Experimental::simd>( + _mm512_fmadd_ps(static_cast<__m512>(a), static_cast<__m512>(b), + static_cast<__m512>(c))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> max( + Experimental::simd> const& a, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + return Experimental::simd>( + _mm512_max_ps(static_cast<__m512>(a), static_cast<__m512>(b))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> min( + Experimental::simd> const& a, + Experimental::simd< + float, 
Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + return Experimental::simd>( + _mm512_min_ps(static_cast<__m512>(a), static_cast<__m512>(b))); +} + +namespace Experimental { + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> condition( + simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_ps(static_cast<__mmask16>(a), static_cast<__m512>(c), + static_cast<__m512>(b))); +} + template <> class simd> { __m256i m_value; @@ -908,12 +1298,12 @@ namespace Experimental { } template <> -class simd> { - __m256i m_value; +class simd> { + __m512i m_value; public: - using value_type = std::uint32_t; - using abi_type = simd_abi::avx512_fixed_size<8>; + using value_type = std::int32_t; + using abi_type = simd_abi::avx512_fixed_size<16>; using mask_type = simd_mask; using reference = value_type&; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; @@ -922,19 +1312,17 @@ class simd> { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { - return 8; + return 16; } template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi32( - Kokkos::bit_cast(value_type(value)))) {} + : m_value(_mm512_set1_epi32(value_type(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m256i const& value_in) + __m512i const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd> const& other) - : m_value(static_cast<__m256i>(other)) {} + simd const& other); template > { bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( G&& gen) noexcept - : m_value( - _mm256_setr_epi32(gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), + : 
m_value(_mm512_setr_epi32( + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_mask_storeu_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_mask_store_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() + const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return simd(_mm512_sub_epi32(_mm512_set1_epi32(0), m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& 
rhs) noexcept { + return simd(_mm512_mullo_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd>( + _mm512_add_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd>( + _mm512_sub_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epi32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epi32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpeq_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpneq_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_srai_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_srav_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_slli_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_sllv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> +abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512i const rhs = static_cast<__m512i>(a); + return Experimental::simd>( + _mm512_abs_epi32(rhs)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +round(Experimental::simd< + std::int32_t, 
Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(a), + static_cast<__m512i>(c), + static_cast<__m512i>(b))); +} + +template <> +class simd> { + __m256i m_value; + + public: + using value_type = std::uint32_t; + using abi_type = simd_abi::avx512_fixed_size<8>; + using mask_type = simd_mask; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm256_set1_epi32( + Kokkos::bit_cast(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd> const& other) + : m_value(static_cast<__m256i>(other)) {} + template ()); } + std::is_invocable_r_v>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value( + 
_mm256_setr_epi32(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} @@ -960,6 +1566,16 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { m_value = _mm256_mask_loadu_epi32( @@ -970,142 +1586,344 @@ class simd> { m_value = _mm256_mask_load_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return 
mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpeq_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpneq_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_srli_epi32(static_cast<__m256i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_srlv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return 
simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> +abs(Experimental::simd> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +floor(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +ceil(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +round(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +trunc(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), + static_cast<__m256i>(b))); +} + +template <> +class simd> { + __m512i m_value; + + public: + using value_type = std::uint32_t; + using abi_type = 
simd_abi::avx512_fixed_size<16>; + using mask_type = simd_mask; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm512_set1_epi32( + Kokkos::bit_cast(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m512i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd> const& other) + : m_value(static_cast<__m512i>(other)) {} + template ()); } + std::is_invocable_r_v>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value(_mm512_setr_epi32( + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) 
{ + m_value = _mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + _mm512_mask_storeu_epi32(ptr, static_cast<__mmask16>(mask_type(true)), m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, vector_aligned_tag) const { - _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + _mm512_mask_store_epi32(ptr, static_cast<__mmask16>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return simd(_mm512_mullo_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + _mm512_add_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + _mm512_sub_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& 
rhs) noexcept { - return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmplt_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm512_cmplt_epu32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmple_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm512_cmple_epu32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpeq_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmpeq_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpneq_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmpneq_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { - return 
simd(_mm256_srli_epi32(static_cast<__m256i>(lhs), rhs)); + return simd(_mm512_srli_epi32(static_cast<__m512i>(lhs), rhs)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_srlv_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return simd(_mm512_srlv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { - return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); + return simd(_mm512_slli_epi32(static_cast<__m512i>(lhs), rhs)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return simd(_mm512_sllv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> -abs(Experimental::simd> const& a) { + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> +abs(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { return a; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> floor(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, 
Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> ceil(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> round(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> trunc(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), - static_cast<__m256i>(b))); + simd> + condition( + simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(a), + static_cast<__m512i>(c), + static_cast<__m512i>(b))); } template <> @@ -1716,6 +2534,95 @@ 
class where_expression>, } }; +template <> +class const_where_expression>, + simd>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + _mm512_mask_storeu_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm512_mask_store_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd> const& index) const { + _mm512_mask_i32scatter_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_ps( + _mm512_set1_ps(0.0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_ps( + _mm512_set1_ps(0.0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void 
gather_from( + float const* mem, + simd> const& index) { + m_value = value_type(_mm512_mask_i32gather_ps( + static_cast<__m512>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = simd>(_mm512_mask_blend_ps( + static_cast<__mmask16>(m_mask), static_cast<__m512>(m_value), + static_cast<__m512>(x_as_value_type))); + } +}; + template <> class const_where_expression< simd_mask>, @@ -1810,6 +2717,98 @@ class where_expression>, } }; +template <> +class const_where_expression< + simd_mask>, + simd>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + _mm512_mask_storeu_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd> const& index) const { + _mm512_mask_i32scatter_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512i>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + 
simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd> const& index) { + m_value = value_type(_mm512_mask_i32gather_epi32( + static_cast<__m512i>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = simd>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value), + static_cast<__m512i>(x_as_value_type))); + } +}; + template <> class const_where_expression< simd_mask>, @@ -1905,6 +2904,99 @@ class where_expression>, } }; +template <> +class const_where_expression< + simd_mask>, + simd>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, element_aligned_tag) const { + _mm512_mask_storeu_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::uint32_t* mem, + simd> const& index) const { + _mm512_mask_i32scatter_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512i>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression< + simd_mask>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::uint32_t const* mem, + simd> const& index) { + m_value = value_type(_mm512_mask_i32gather_epi32( + static_cast<__m512i>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = simd>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value), + static_cast<__m512i>(x_as_value_type))); + } +}; + template <> class const_where_expression< 
simd_mask>, diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp index efc81135d16..8cb0cc75fc0 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp @@ -42,11 +42,11 @@ class neon_fixed_size {}; namespace Impl { -template +template class neon_mask; template -class neon_mask { +class neon_mask { uint64x2_t m_value; public: @@ -104,12 +104,13 @@ class neon_mask { } template KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( - neon_mask const& other) { + neon_mask const& other) { operator[](0) = bool(other[0]); operator[](1) = bool(other[1]); } template - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask const& other) : neon_mask(static_cast(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; @@ -158,7 +159,7 @@ class neon_mask { }; template -class neon_mask { +class neon_mask { uint32x2_t m_value; public: @@ -211,10 +212,12 @@ class neon_mask { m_value, 1); } template - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask const& other) : m_value(vqmovn_u64(static_cast(other))) {} template - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask const& other) : m_value(static_cast(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; @@ -260,14 +263,125 @@ class neon_mask { } }; +template +class neon_mask { + uint32x4_t m_value; + + public: + class reference { + uint32x4_t& m_mask; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(uint32x4_t& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + switch (m_lane) { + case 0: + 
m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 0); + break; + case 1: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 1); + break; + case 2: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 2); + break; + case 3: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 3); + break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + switch (m_lane) { + case 0: return vgetq_lane_u32(m_mask, 0) != 0; + case 1: return vgetq_lane_u32(m_mask, 1) != 0; + case 2: return vgetq_lane_u32(m_mask, 2) != 0; + case 3: return vgetq_lane_u32(m_mask, 3) != 0; + } + return false; + } + }; + using value_type = bool; + using abi_type = simd_abi::neon_fixed_size<4>; + using implementation_type = uint32x4_t; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value) + : m_value(vmovq_n_u32(value ? 0xFFFFFFFFU : 0)) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + G&& gen) noexcept { + m_value = vsetq_lane_u32( + (gen(std::integral_constant()) ? 0xFFFFFFFFU : 0), + m_value, 0); + m_value = vsetq_lane_u32( + (gen(std::integral_constant()) ? 0xFFFFFFFFU : 0), + m_value, 1); + m_value = vsetq_lane_u32( + (gen(std::integral_constant()) ? 0xFFFFFFFFU : 0), + m_value, 2); + m_value = vsetq_lane_u32( + (gen(std::integral_constant()) ? 
0xFFFFFFFFU : 0), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + uint32x4_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint32x4_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast( + reference(const_cast(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator||(neon_mask const& other) const { + return Derived(vorrq_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator&&(neon_mask const& other) const { + return Derived(vandq_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived operator!() const { + auto const true_value = static_cast(neon_mask(true)); + return Derived(veorq_u32(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + neon_mask const& other) const { + uint32x4_t const elementwise_equality = vceqq_u32(m_value, other.m_value); + uint64x2_t const overall_equality_neon = + vreinterpretq_u64_u32(elementwise_equality); + return (overall_equality_neon[0] == 0xFFFFFFFFFFFFFFFFULL) && + (overall_equality_neon[1] == 0xFFFFFFFFFFFFFFFFULL); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + neon_mask const& other) const { + return !operator==(other); + } +}; + } // namespace Impl template class simd_mask> : public Impl::neon_mask>, - sizeof(T) * 8> { + sizeof(T) * 8, 2> { using base_type = Impl::neon_mask>, - sizeof(T) * 8>; + sizeof(T) * 8, 2>; public: using implementation_type = typename base_type::implementation_type; @@ -291,6 +405,35 @@ class simd_mask> : base_type(gen) {} }; +template +class simd_mask> + : public 
Impl::neon_mask>, + sizeof(T) * 8, 4> { + using base_type = Impl::neon_mask>, + sizeof(T) * 8, 4>; + + public: + using implementation_type = typename base_type::implementation_type; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(bool value) + : base_type(value) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask> const& other) + : base_type(other) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + implementation_type const& value) + : base_type(value) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : base_type(gen) {} +}; + template <> class simd> { float64x2_t m_value; @@ -788,6 +931,256 @@ namespace Experimental { static_cast(c))); } +template <> +class simd> { + float32x4_t m_value; + + public: + using value_type = float; + using abi_type = simd_abi::neon_fixed_size<4>; + using mask_type = simd_mask; + class reference { + float32x4_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(float32x4_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(float value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_f32(value, m_value, 0); break; + case 1: m_value = vsetq_lane_f32(value, m_value, 1); break; + case 2: m_value = vsetq_lane_f32(value, m_value, 2); break; + case 3: m_value = vsetq_lane_f32(value, m_value, 3); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator float() const { + switch (m_lane) { + case 0: return vgetq_lane_f32(m_value, 0); + case 1: return vgetq_lane_f32(m_value, 1); + case 2: return vgetq_lane_f32(m_value, 2); + case 3: return vgetq_lane_f32(m_value, 3); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd 
const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_f32(value_type(value))) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vsetq_lane_f32(gen(std::integral_constant()), + m_value, 0); + m_value = vsetq_lane_f32(gen(std::integral_constant()), + m_value, 1); + m_value = vsetq_lane_f32(gen(std::integral_constant()), + m_value, 2); + m_value = vsetq_lane_f32(gen(std::integral_constant()), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + float32x4_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1q_f32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_f32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit + operator float32x4_t() const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return 
simd(vnegq_f32(m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(vmulq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(vdivq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd(vaddq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(vsubq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcltq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcgtq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcleq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcgeq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vceqq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + 
float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vabsq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vrndmq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vrndpq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vrndxq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vrndq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +copysign( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { + uint32x4_t const sign_mask = vreinterpretq_u32_f32(vmovq_n_f32(-0.0)); + return Experimental::simd>( + vreinterpretq_f32_u32(vorrq_u32( + vreinterpretq_u32_f32(static_cast(abs(a))), + vandq_u32(sign_mask, + vreinterpretq_u32_f32(static_cast(b)))))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vsqrtq_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +fma(Experimental::simd> const& + a, + Experimental::simd> const& + b, + Experimental::simd> const& + c) { + 
return Experimental::simd>( + vfmaq_f32(static_cast(c), static_cast(b), + static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +max(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( + vmaxq_f32(static_cast(a), static_cast(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +min(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( + vminq_f32(static_cast(a), static_cast(b))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vbslq_f32(static_cast(a), static_cast(b), + static_cast(c))); +} + template <> class simd> { int32x2_t m_value; @@ -1001,7 +1394,227 @@ namespace Experimental { } template <> -class simd> { +class simd> { + int32x4_t m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::neon_fixed_size<4>; + using mask_type = simd_mask; + class reference { + int32x4_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(int32x4_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(std::int32_t value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_s32(value, m_value, 0); break; + case 1: m_value = vsetq_lane_s32(value, m_value, 1); break; + case 2: m_value = vsetq_lane_s32(value, m_value, 2); break; + case 3: m_value = vsetq_lane_s32(value, m_value, 3); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator std::int32_t() const { + switch (m_lane) { + case 0: return vgetq_lane_s32(m_value, 0); + case 1: return vgetq_lane_s32(m_value, 1); + case 2: return vgetq_lane_s32(m_value, 2); + 
case 3: return vgetq_lane_s32(m_value, 3); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_s32(value_type(value))) {} + template >, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept { + m_value = vsetq_lane_s32(gen(std::integral_constant()), + m_value, 0); + m_value = vsetq_lane_s32(gen(std::integral_constant()), + m_value, 1); + m_value = vsetq_lane_s32(gen(std::integral_constant()), + m_value, 2); + m_value = vsetq_lane_s32(gen(std::integral_constant()), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + int32x4_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1q_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + 
vector_aligned_tag) const { + vst1q_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x4_t() + const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return simd(vnegq_s32(m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vsubq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vaddq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmulq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vceqq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcgtq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcltq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcleq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcgeq_s32(static_cast(lhs), static_cast(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, 
simd const& rhs) noexcept { + return !(lhs == rhs); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(vshlq_s32(static_cast(lhs), + vnegq_s32(vmovq_n_s32(std::int32_t(rhs))))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(vshlq_s32(static_cast(lhs), + vnegq_s32(static_cast(rhs)))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd( + vshlq_s32(static_cast(lhs), vmovq_n_s32(std::int32_t(rhs)))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vshlq_s32(static_cast(lhs), static_cast(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd>( + vabsq_s32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return a; +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + 
condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vbslq_s32(static_cast(a), static_cast(b), + static_cast(c))); +} + +template <> +class simd> { int64x2_t m_value; public: @@ -1593,6 +2206,106 @@ class where_expression>, } }; +template <> +class const_where_expression>, + simd>> { + public: + using abi_type = simd_abi::neon_fixed_size<4>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd> const& index) const { + if (m_mask[0]) mem[index[0]] = m_value[0]; + if (m_mask[1]) mem[index[1]] = m_value[1]; + if (m_mask[2]) mem[index[2]] = m_value[2]; + if (m_mask[3]) mem[index[3]] = m_value[3]; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* 
mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + float const* mem, + simd> const& index) { + if (m_mask[0]) m_value[0] = mem[index[0]]; + if (m_mask[1]) m_value[1] = mem[index[1]]; + if (m_mask[2]) m_value[2] = mem[index[2]]; + if (m_mask[3]) m_value[3] = mem[index[3]]; + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = static_cast>>( + vbslq_f32(static_cast(m_mask), + static_cast(x_as_value_type), + static_cast(m_value))); + } +}; + template <> class const_where_expression< simd_mask>, @@ -1686,6 +2399,108 @@ class where_expression>, } }; +template <> +class const_where_expression< + simd_mask>, + simd>> { + public: + using abi_type = simd_abi::neon_fixed_size<4>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = 
m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd> const& index) const { + if (m_mask[0]) mem[index[0]] = m_value[0]; + if (m_mask[1]) mem[index[1]] = m_value[1]; + if (m_mask[2]) mem[index[2]] = m_value[2]; + if (m_mask[3]) mem[index[3]] = m_value[3]; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd> const& index) { + if (m_mask[0]) m_value[0] = mem[index[0]]; + if (m_mask[1]) m_value[1] = mem[index[1]]; + if (m_mask[2]) m_value[2] = mem[index[2]]; + if (m_mask[3]) m_value[3] = mem[index[3]]; + } + template < + class U, + std::enable_if_t< + std::is_convertible_v>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = static_cast>>( + vbslq_s32(static_cast(m_mask), + static_cast(x_as_value_type), + static_cast(m_value))); + } +}; + 
template <> class const_where_expression< simd_mask>, diff --git a/lib/kokkos/simd/unit_tests/CMakeLists.txt b/lib/kokkos/simd/unit_tests/CMakeLists.txt index 75d557e8b52..109effc710d 100644 --- a/lib/kokkos/simd/unit_tests/CMakeLists.txt +++ b/lib/kokkos/simd/unit_tests/CMakeLists.txt @@ -1,7 +1,9 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/simd/unit_tests/include) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SIMD - SOURCES - UnitTestMain.cpp - TestSIMD.cpp) +IF((NOT (Kokkos_ENABLE_CUDA AND WIN32))) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SIMD + SOURCES + UnitTestMain.cpp + TestSIMD.cpp) +ENDIF() diff --git a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp index c587ccf3046..74141f25316 100644 --- a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,9 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() return Kokkos::Experimental::abs(a); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #else return Kokkos::abs(a); #endif diff --git a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp index d36e1e5afc5..9719855f0ff 100644 --- a/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/lib/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -135,8 +135,8 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } + result = T(0); where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); - where(!mask, result) = 0; return true; } template @@ -181,4 +181,14 @@ class load_as_scalars { } }; +// Simple check to loosely test that T is a complete type. +// Some capabilities are only defined for specific data type and abi pairs (i.e. 
+// extended vector width); this is used to exclude pairs that +// are not defined from being tested. +template +constexpr bool is_type_v = false; + +template +constexpr bool is_type_v = true; + #endif diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp index f8d8cc70fa4..bf22cf3352b 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp @@ -22,21 +22,23 @@ template inline void host_check_condition() { - using simd_type = typename Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - - auto condition_op = [](mask_type const& mask, simd_type const& a, - simd_type const& b) { - return Kokkos::Experimental::condition(mask, a, b); - }; - - simd_type value_a(16); - simd_type value_b(20); - - auto condition_result = condition_op(mask_type(false), value_a, value_b); - EXPECT_TRUE(all_of(condition_result == value_b)); - condition_result = condition_op(mask_type(true), value_a, value_b); - EXPECT_TRUE(all_of(condition_result == value_a)); + if constexpr (is_type_v>) { + using simd_type = typename Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + + auto condition_op = [](mask_type const& mask, simd_type const& a, + simd_type const& b) { + return Kokkos::Experimental::condition(mask, a, b); + }; + + simd_type value_a(16); + simd_type value_b(20); + + auto condition_result = condition_op(mask_type(false), value_a, value_b); + EXPECT_TRUE(all_of(condition_result == value_b)); + condition_result = condition_op(mask_type(true), value_a, value_b); + EXPECT_TRUE(all_of(condition_result == value_a)); + } } template @@ -54,22 +56,24 @@ inline void host_check_condition_all_abis( template KOKKOS_INLINE_FUNCTION void device_check_condition() { - using simd_type = typename Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - kokkos_checker 
checker; - - auto condition_op = [](mask_type const& mask, simd_type const& a, - simd_type const& b) { - return Kokkos::Experimental::condition(mask, a, b); - }; - - simd_type value_a(16); - simd_type value_b(20); - - auto condition_result = condition_op(mask_type(false), value_a, value_b); - checker.truth(all_of(condition_result == value_b)); - condition_result = condition_op(mask_type(true), value_a, value_b); - checker.truth(all_of(condition_result == value_a)); + if constexpr (is_type_v>) { + using simd_type = typename Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + kokkos_checker checker; + + auto condition_op = [](mask_type const& mask, simd_type const& a, + simd_type const& b) { + return Kokkos::Experimental::condition(mask, a, b); + }; + + simd_type value_a(16); + simd_type value_b(20); + + auto condition_result = condition_op(mask_type(false), value_a, value_b); + checker.truth(all_of(condition_result == value_b)); + condition_result = condition_op(mask_type(true), value_a, value_b); + checker.truth(all_of(condition_result == value_a)); + } } template diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp index b98871bbab8..20b0729762c 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp @@ -22,40 +22,42 @@ template inline void host_check_conversions() { - { - auto a = Kokkos::Experimental::simd(1); - auto b = Kokkos::Experimental::simd(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd(1); - auto b = Kokkos::Experimental::simd(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd(1); - auto b = Kokkos::Experimental::simd(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - 
EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - EXPECT_TRUE(b == decltype(b)(true)); + if constexpr (is_type_v>) { + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } } } @@ -67,41 +69,43 @@ inline void host_check_conversions_all_abis( template KOKKOS_INLINE_FUNCTION void device_check_conversions() { - kokkos_checker checker; - { - auto a = Kokkos::Experimental::simd(1); - auto b = Kokkos::Experimental::simd(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd(1); - auto b = Kokkos::Experimental::simd(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd(1); - auto b = 
Kokkos::Experimental::simd(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask(true); - auto b = Kokkos::Experimental::simd_mask(a); - checker.truth(b == decltype(b)(true)); + if constexpr (is_type_v>) { + kokkos_checker checker; + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } } } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 23e3826c752..1a61fd9cbbb 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ 
b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -22,49 +22,51 @@ template inline void host_check_gen_ctor() { - using simd_type = Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - constexpr std::size_t lanes = simd_type::size(); - - DataType init[lanes]; - DataType expected[lanes]; - mask_type init_mask(false); - - for (std::size_t i = 0; i < lanes; ++i) { - if (i % 3 == 0) init_mask[i] = true; - init[i] = 7; - expected[i] = (init_mask[i]) ? init[i] * 9 : init[i]; - } - - simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); - - simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); + if constexpr (is_type_v>) { + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + constexpr std::size_t lanes = simd_type::size(); -#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) - if constexpr (std::is_same_v) { - simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); - host_check_equality(basic, rhs, lanes); + DataType init[lanes]; + DataType expected[lanes]; + mask_type init_mask(false); - simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); - mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); - simd_type result( - KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + for (std::size_t i = 0; i < lanes; ++i) { + if (i % 3 == 0) init_mask[i] = true; + init[i] = 7; + expected[i] = (init_mask[i]) ? init[i] * 9 : init[i]; + } - host_check_equality(blend, result, lanes); - } else { - simd_type basic([=](std::size_t i) { return init[i]; }); - host_check_equality(basic, rhs, lanes); + simd_type rhs; + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); - simd_type lhs([=](std::size_t i) { return init[i] * 9; }); - mask_type mask([=](std::size_t i) { return init_mask[i]; }); - simd_type result( - [=](std::size_t i) { return (mask[i]) ? 
lhs[i] : rhs[i]; }); + simd_type blend; + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); - host_check_equality(blend, result, lanes); - } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) + if constexpr (std::is_same_v) { + simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); + mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); + simd_type result( + KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } else { + simd_type basic([=](std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs([=](std::size_t i) { return init[i] * 9; }); + mask_type mask([=](std::size_t i) { return init_mask[i]; }); + simd_type result( + [=](std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } #endif + } } template @@ -82,32 +84,34 @@ inline void host_check_gen_ctors_all_abis( template KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { - using simd_type = Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - constexpr std::size_t lanes = simd_type::size(); - - DataType init[lanes]; - DataType expected[lanes]; - mask_type mask(false); - - for (std::size_t i = 0; i < lanes; ++i) { - if (i % 3 == 0) mask[i] = true; - init[i] = 7; - expected[i] = (mask[i]) ? init[i] * 9 : init[i]; - } + if constexpr (is_type_v>) { + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + constexpr std::size_t lanes = simd_type::size(); + + DataType init[lanes]; + DataType expected[lanes]; + mask_type mask(false); + + for (std::size_t i = 0; i < lanes; ++i) { + if (i % 3 == 0) mask[i] = true; + init[i] = 7; + expected[i] = (mask[i]) ? 
init[i] * 9 : init[i]; + } - simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); - simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); - device_check_equality(basic, rhs, lanes); + simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); + simd_type rhs; + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); + device_check_equality(basic, rhs, lanes); - simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); - simd_type result( - KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); + simd_type result( + KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); - simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); - device_check_equality(result, blend, lanes); + simd_type blend; + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); + device_check_equality(result, blend, lanes); + } } template diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp index a93c52e9a8d..c3d4ac594d0 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp @@ -22,25 +22,27 @@ template inline void host_check_mask_ops() { - using mask_type = Kokkos::Experimental::simd_mask; - - EXPECT_FALSE(none_of(mask_type(true))); - EXPECT_TRUE(none_of(mask_type(false))); - EXPECT_TRUE(all_of(mask_type(true))); - EXPECT_FALSE(all_of(mask_type(false))); - EXPECT_TRUE(any_of(mask_type(true))); - EXPECT_FALSE(any_of(mask_type(false))); - - for (std::size_t i = 0; i < mask_type::size(); ++i) { - mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); - - EXPECT_TRUE(any_of(test_mask)); - EXPECT_FALSE(none_of(test_mask)); - - if constexpr (mask_type::size() > 1) { - EXPECT_FALSE(all_of(test_mask)); - } else { - 
EXPECT_TRUE(all_of(test_mask)); + if constexpr (is_type_v>) { + using mask_type = Kokkos::Experimental::simd_mask; + + EXPECT_FALSE(none_of(mask_type(true))); + EXPECT_TRUE(none_of(mask_type(false))); + EXPECT_TRUE(all_of(mask_type(true))); + EXPECT_FALSE(all_of(mask_type(false))); + EXPECT_TRUE(any_of(mask_type(true))); + EXPECT_FALSE(any_of(mask_type(false))); + + for (std::size_t i = 0; i < mask_type::size(); ++i) { + mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); + + EXPECT_TRUE(any_of(test_mask)); + EXPECT_FALSE(none_of(test_mask)); + + if constexpr (mask_type::size() > 1) { + EXPECT_FALSE(all_of(test_mask)); + } else { + EXPECT_TRUE(all_of(test_mask)); + } } } } @@ -60,25 +62,27 @@ inline void host_check_mask_ops_all_abis( template KOKKOS_INLINE_FUNCTION void device_check_mask_ops() { - using mask_type = Kokkos::Experimental::simd_mask; - kokkos_checker checker; - checker.truth(!none_of(mask_type(true))); - checker.truth(none_of(mask_type(false))); - checker.truth(all_of(mask_type(true))); - checker.truth(!all_of(mask_type(false))); - checker.truth(any_of(mask_type(true))); - checker.truth(!any_of(mask_type(false))); - - for (std::size_t i = 0; i < mask_type::size(); ++i) { - mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); - - checker.truth(any_of(test_mask)); - checker.truth(!none_of(test_mask)); - - if constexpr (mask_type::size() > 1) { - checker.truth(!all_of(test_mask)); - } else { - checker.truth(all_of(test_mask)); + if constexpr (is_type_v>) { + using mask_type = Kokkos::Experimental::simd_mask; + kokkos_checker checker; + checker.truth(!none_of(mask_type(true))); + checker.truth(none_of(mask_type(false))); + checker.truth(all_of(mask_type(true))); + checker.truth(!all_of(mask_type(false))); + checker.truth(any_of(mask_type(true))); + checker.truth(!any_of(mask_type(false))); + + for (std::size_t i = 0; i < mask_type::size(); ++i) { + mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); + + 
checker.truth(any_of(test_mask)); + checker.truth(!none_of(test_mask)); + + if constexpr (mask_type::size() > 1) { + checker.truth(!all_of(test_mask)); + } else { + checker.truth(all_of(test_mask)); + } } } } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp index 59f2f6c18fd..4891a54f6c5 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -121,31 +121,34 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { - constexpr size_t n = 11; - constexpr size_t alignment = - Kokkos::Experimental::simd::size() * sizeof(DataType); - - host_check_abi_size(); - - if constexpr (!std::is_integral_v) { - alignas(alignment) DataType const first_args[n] = { - 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, 0.0, 1.2, -2.8}; - alignas(alignment) DataType const second_args[n] = { - 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; - host_check_all_math_ops(first_args, second_args); - } else { - if constexpr (std::is_signed_v) { - alignas(alignment) - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, - -2, 1, 13, -3, -2}; + if constexpr (is_type_v>) { + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); + + host_check_abi_size(); + + if constexpr (!std::is_integral_v) { + alignas(alignment) DataType const first_args[] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, + 0.0, 1.2, -2.8, 3.0, 4.0, -0.1, 5.0, -0.2}; + alignas(alignment) DataType const second_args[] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, + 13.0, -3.2, -2.1, 3.0, -15.0, -0.5, -0.2, -0.2}; host_check_all_math_ops(first_args, second_args); } else { - alignas(alignment) - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - alignas(alignment) - DataType const second_args[n] = {1, 2, 1, 
1, 1, 3, 2, 1, 13, 3, 2}; - host_check_all_math_ops(first_args, second_args); + if constexpr (std::is_signed_v) { + alignas(alignment) DataType const first_args[] = { + 1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2, -3, 7, 4, -9, -15}; + alignas(alignment) DataType const second_args[] = { + 1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2, 10, -15, 7, 2, -10}; + host_check_all_math_ops(first_args, second_args); + } else { + alignas(alignment) DataType const first_args[] = { + 1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2, 11, 5, 8, 2, 14}; + alignas(alignment) DataType const second_args[] = { + 1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2, 3, 6, 20, 5, 14}; + host_check_all_math_ops(first_args, second_args); + } } } } @@ -253,25 +256,31 @@ KOKKOS_INLINE_FUNCTION void device_check_abi_size() { template KOKKOS_INLINE_FUNCTION void device_check_math_ops() { - constexpr size_t n = 11; - - device_check_abi_size(); - - if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; - device_check_all_math_ops(first_args, second_args); - } else { - if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + if constexpr (is_type_v>) { + device_check_abi_size(); + + if constexpr (!std::is_integral_v) { + DataType const first_args[] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, + -2.0, 10.0, 0.0, 1.2, -2.8, 3.0, + 4.0, -0.1, 5.0, -0.2}; + DataType const second_args[] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, + -2.4, 1.0, 13.0, -3.2, -2.1, 3.0, + -15.0, -0.5, -0.2, -0.2}; device_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; - device_check_all_math_ops(first_args, second_args); + if constexpr 
(std::is_signed_v) { + DataType const first_args[] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -3, 7, 4, -9, -15}; + DataType const second_args[] = {1, 2, 1, 1, 1, -3, -2, 1, + 13, -3, -2, 10, -15, 7, 2, -10}; + device_check_all_math_ops(first_args, second_args); + } else { + DataType const first_args[] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 11, 5, 8, 2, 14}; + DataType const second_args[] = {1, 2, 1, 1, 1, 3, 2, 1, + 13, 3, 2, 3, 6, 20, 5, 14}; + device_check_all_math_ops(first_args, second_args); + } } } } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp index b3c7ac9a01e..a3e796a0301 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -65,14 +65,18 @@ inline void host_check_all_reductions(const DataType (&args)[n]) { template inline void host_check_reductions() { - constexpr size_t n = 11; - - if constexpr (std::is_signed_v) { - DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - host_check_all_reductions(args); - } else { - DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - host_check_all_reductions(args); + if constexpr (is_type_v>) { + constexpr size_t n = 16; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -15, 5, 17, -22, 20}; + host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 15, 5, 17, 22, 20}; + host_check_all_reductions(args); + } } } @@ -135,14 +139,18 @@ KOKKOS_INLINE_FUNCTION void device_check_all_reductions( template KOKKOS_INLINE_FUNCTION void device_check_reductions() { - constexpr size_t n = 11; - - if constexpr (std::is_signed_v) { - DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - device_check_all_reductions(args); - } else { - DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - 
device_check_all_reductions(args); + if constexpr (is_type_v>) { + constexpr size_t n = 16; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -15, 5, 17, -22, 20}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 15, 5, 17, 22, 20}; + device_check_all_reductions(args); + } } } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index ffdd2cba4a0..7329f085018 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -103,34 +103,35 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, template inline void host_check_shift_ops() { - if constexpr (std::is_integral_v) { - using simd_type = Kokkos::Experimental::simd; - constexpr std::size_t width = simd_type::size(); - constexpr std::size_t num_cases = 8; - constexpr size_t alignment = - Kokkos::Experimental::simd::size() * sizeof(DataType); - - DataType max = std::numeric_limits::max(); - - alignas(alignment) DataType shift_by[num_cases] = { - 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - alignas(alignment) DataType test_vals[width]; - for (std::size_t i = 0; i < width; ++i) { - DataType inc = max / width; - test_vals[i] = i * inc + 1; - } - - host_check_shift_op_all_loaders(shift_right(), test_vals, shift_by, - num_cases); - host_check_shift_op_all_loaders(shift_left(), test_vals, shift_by, - num_cases); + if constexpr (is_type_v>) { + if constexpr (std::is_integral_v) { + using simd_type = Kokkos::Experimental::simd; + constexpr std::size_t width = simd_type::size(); + constexpr std::size_t num_cases = 16; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); + + DataType max = std::numeric_limits::max(); + + alignas(alignment) DataType shift_by[num_cases] = { + 0, 1, 3, width / 2, 
width / 2 + 1, width - 1, width, width + 1, + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; + alignas(alignment) DataType test_vals[width]; + for (std::size_t i = 0; i < width; ++i) { + DataType inc = max / width; + test_vals[i] = i * inc + 1; + } - if constexpr (std::is_signed_v) { - for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; host_check_shift_op_all_loaders(shift_right(), test_vals, shift_by, num_cases); host_check_shift_op_all_loaders(shift_left(), test_vals, shift_by, num_cases); + + if constexpr (std::is_signed_v) { + for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; + host_check_shift_op_all_loaders(shift_right(), test_vals, shift_by, + num_cases); + } } } } @@ -224,33 +225,34 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( template KOKKOS_INLINE_FUNCTION void device_check_shift_ops() { - if constexpr (std::is_integral_v) { - using simd_type = Kokkos::Experimental::simd; - constexpr std::size_t width = simd_type::size(); - constexpr std::size_t num_cases = 8; + if constexpr (is_type_v>) { + if constexpr (std::is_integral_v) { + using simd_type = Kokkos::Experimental::simd; + constexpr std::size_t width = simd_type::size(); + constexpr std::size_t num_cases = 16; - DataType max = Kokkos::reduction_identity::max(); + DataType max = Kokkos::reduction_identity::max(); - DataType shift_by[num_cases] = { - 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + DataType shift_by[num_cases] = { + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1, + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; + DataType test_vals[width]; - for (std::size_t i = 0; i < width; ++i) { - DataType inc = max / width; - test_vals[i] = i * inc + 1; - } + for (std::size_t i = 0; i < width; ++i) { + DataType inc = max / width; + test_vals[i] = i * inc + 1; + } - device_check_shift_op_all_loaders(shift_right(), test_vals, shift_by, - num_cases); - 
device_check_shift_op_all_loaders(shift_left(), test_vals, shift_by, - num_cases); - - if constexpr (std::is_signed_v) { - for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; device_check_shift_op_all_loaders(shift_right(), test_vals, shift_by, num_cases); device_check_shift_op_all_loaders(shift_left(), test_vals, shift_by, num_cases); + + if constexpr (std::is_signed_v) { + for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; + device_check_shift_op_all_loaders(shift_right(), test_vals, + shift_by, num_cases); + } } } } diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 152fd9e9840..904b2c665e5 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -22,60 +22,66 @@ template inline void host_check_where_expr_scatter_to() { - using simd_type = Kokkos::Experimental::simd; - using index_type = Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; - simd_type src; - src.copy_from(init, Kokkos::Experimental::simd_flag_default); - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - DataType dst[8] = {0}; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + if constexpr (is_type_v>) { + using simd_type = Kokkos::Experimental::simd; + using index_type = Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + simd_type src; + src.copy_from(init, Kokkos::Experimental::simd_flag_default); + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + DataType dst[simd_type::size()] = {0}; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + } + where(mask, src).scatter_to(dst, index); + + simd_type dst_simd; + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); + + host_check_equality(expected_result, dst_simd, nlanes); } - where(mask, src).scatter_to(dst, index); - - simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); - - host_check_equality(expected_result, dst_simd, nlanes); } } template inline void host_check_where_expr_gather_from() { - using simd_type = Kokkos::Experimental::simd; - using index_type = Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37}; - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - simd_type dst; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + if constexpr (is_type_v>) { + using simd_type = Kokkos::Experimental::simd; + using index_type = Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + simd_type dst; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + } + where(mask, dst).gather_from(src, index); + + host_check_equality(expected_result, dst, nlanes); } - where(mask, dst).gather_from(src, index); - - host_check_equality(expected_result, dst, nlanes); } } @@ -100,33 +106,36 @@ inline void host_check_where_expr_all_abis( template KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { - using simd_type = Kokkos::Experimental::simd; - using index_type = Kokkos::Experimental::simd; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; - simd_type src; - src.copy_from(init, Kokkos::Experimental::simd_flag_default); - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - DataType dst[8] = {0}; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + if constexpr (is_type_v>) { + using simd_type = Kokkos::Experimental::simd; + using index_type = Kokkos::Experimental::simd; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + simd_type src; + src.copy_from(init, Kokkos::Experimental::simd_flag_default); + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + DataType dst[simd_type::size()] = {0}; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + } + where(mask, src).scatter_to(dst, index); + + simd_type dst_simd; + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); + + device_check_equality(expected_result, dst_simd, nlanes); } - where(mask, src).scatter_to(dst, index); - - simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); - - device_check_equality(expected_result, dst_simd, nlanes); } } @@ -137,7 +146,8 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_gather_from() { using mask_type = typename simd_type::mask_type; std::size_t nlanes = simd_type::size(); - DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37}; + DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 100644 index 00000000000..0eab27fe989 --- /dev/null +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp index 3d69dcf6c50..e7f9239e03d 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp @@ -6,533 +6,95 @@ Source: SPDX-License-Identifier: (BSD-3-Clause) */ -#ifndef DESUL_ATOMIC_REF_IMPL_HPP_ -#define DESUL_ATOMIC_REF_IMPL_HPP_ +#ifndef DESUL_ATOMIC_REF_HPP_ +#define DESUL_ATOMIC_REF_HPP_ -#include #include #include #include -#include -#include namespace desul { -namespace Impl { -// TODO 
current implementation is missing the following: -// * member functions -// * wait -// * notify_one -// * notify_all - -template {}, - bool = std::is_floating_point{}> -struct basic_atomic_ref; - -// base class for non-integral, non-floating-point, non-pointer types -template -struct basic_atomic_ref { - static_assert(std::is_trivially_copyable{}, ""); - - private: - T* _ptr; - - // 1/2/4/8/16-byte types must be aligned to at least their size - static constexpr int _min_alignment = (sizeof(T) & (sizeof(T) - 1)) || sizeof(T) > 16 - ? 0 - : sizeof(T); - - public: - using value_type = T; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = _min_alignment > alignof(T) - ? _min_alignment - : alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - basic_atomic_ref(basic_atomic_ref const&) = default; - - explicit basic_atomic_ref(T& obj) : _ptr(std::addressof(obj)) {} - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template - DESUL_FUNCTION T exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free(); - } - - template - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - 
} - - template - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } -}; - -// base class for atomic_ref -template -struct basic_atomic_ref { - static_assert(std::is_integral{}, ""); - - private: - T* _ptr; - - public: - using value_type = T; - using difference_type = value_type; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = sizeof(T) > alignof(T) ? 
sizeof(T) - : alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T& obj) : _ptr(&obj) {} - - basic_atomic_ref(basic_atomic_ref const&) = default; - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template - DESUL_FUNCTION T exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free(); - } - - template - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - 
cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, arg, order, MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_sub(_ptr, arg, order, MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_and(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_and(_ptr, arg, order, MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_or(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_or(_ptr, arg, order, MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_xor(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_xor(_ptr, arg, order, MemoryScope()); - } - - DESUL_FUNCTION value_type operator++() const noexcept { - return atomic_add_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); } - - DESUL_FUNCTION value_type operator--() const noexcept { - return atomic_sub_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); } - - DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept { - atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept { - atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator&=(value_type arg) const noexcept { - atomic_and_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator|=(value_type arg) const noexcept { - atomic_or_fetch(_ptr, arg, MemoryOrder(), 
MemoryScope()); - } - - DESUL_FUNCTION value_type operator^=(value_type arg) const noexcept { - atomic_xor_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } -}; - -// base class for atomic_ref template -struct basic_atomic_ref { - static_assert(std::is_floating_point{}, ""); - - private: - T* _ptr; +class AtomicRef { + T* ptr_; public: using value_type = T; - using difference_type = value_type; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T& obj) : _ptr(&obj) {} - - basic_atomic_ref(basic_atomic_ref const&) = default; - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template - DESUL_FUNCTION T exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free(); - } - - template - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - 
MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, arg, order, MemoryScope()); - } - - template - DESUL_FUNCTION value_type - fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_sub(_ptr, arg, order, MemoryScope()); - } - - DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept { - atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept { - atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } -}; - -// base class for atomic_ref -template -struct basic_atomic_ref { - private: - T** _ptr; + using memory_order = MemoryOrder; + using memory_scope = MemoryScope; - public: - using value_type = T*; - using difference_type = std::ptrdiff_t; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = alignof(T*); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T*& arg) : _ptr(std::addressof(arg)) {} + DESUL_FUNCTION explicit AtomicRef(T& obj) : ptr_(&obj) {} - basic_atomic_ref(basic_atomic_ref const&) = default; - - T* operator=(T* desired) const noexcept { - this->store(desired); + DESUL_FUNCTION T operator=(T 
desired) const noexcept { + store(desired); return desired; } - operator T*() const noexcept { return this->load(); } - - template - DESUL_FUNCTION void store(T* desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template - DESUL_FUNCTION T* load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template - DESUL_FUNCTION T* exchange(T* desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free(); - } - - template - DESUL_FUNCTION bool compare_exchange_weak(T*& expected, - T* desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_weak( - T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T*& expected, - T* desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template - DESUL_FUNCTION bool compare_exchange_strong( - T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } + DESUL_FUNCTION operator T() const noexcept { return load(); } - template - DESUL_FUNCTION value_type - fetch_add(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, 
_type_size(d), order, MemoryScope()); + DESUL_FUNCTION T load() const noexcept { + return desul::atomic_load(ptr_, MemoryOrder(), MemoryScope()); } - template - DESUL_FUNCTION value_type - fetch_sub(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_sub(_ptr, _type_size(d), order, MemoryScope()); + DESUL_FUNCTION void store(T desired) const noexcept { + return desul::atomic_store(ptr_, desired, MemoryOrder(), MemoryScope()); } - DESUL_FUNCTION value_type operator++() const noexcept { - return atomic_add_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); } - - DESUL_FUNCTION value_type operator--() const noexcept { - return atomic_sub_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope()); + DESUL_FUNCTION T exchange(T desired) const noexcept { + return desul::atomic_exchange(ptr_, desired, MemoryOrder(), MemoryScope()); } - DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); } + // TODO compare_exchange_{weak,strong} and is_lock_free - DESUL_FUNCTION value_type operator+=(difference_type d) const noexcept { - atomic_add_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope()); +#define DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(FETCH_OP, OP_FETCH) \ + DESUL_FUNCTION T FETCH_OP(T arg) const noexcept { \ + return desul::atomic_##FETCH_OP(ptr_, arg, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T OP_FETCH(T arg) const noexcept { \ + return desul::atomic_##OP_FETCH(ptr_, arg, MemoryOrder(), MemoryScope()); \ } - DESUL_FUNCTION value_type operator-=(difference_type d) const noexcept { - atomic_sub_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope()); - } +#define DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(COMPD_ASGMT, OP_FETCH) \ + DESUL_FUNCTION T operator COMPD_ASGMT(T arg) const noexcept { return OP_FETCH(arg); } - private: - static constexpr std::ptrdiff_t _type_size(std::ptrdiff_t d) 
noexcept { - static_assert(std::is_object{}, ""); - return d * sizeof(T); - } -}; + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_add, add_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(+=, add_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_sub, sub_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(-=, sub_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_min, min_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_max, max_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_mul, mul_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(*=, mul_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_div, div_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(/=, div_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_mod, mod_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(%=, mod_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_and, and_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(&=, and_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_or, or_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(|=, or_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_xor, xor_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(^=, xor_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_nand, nand_fetch) -} // namespace Impl - -template -struct scoped_atomic_ref : Impl::basic_atomic_ref { - explicit scoped_atomic_ref(T& obj) noexcept - : Impl::basic_atomic_ref(obj) {} +#undef DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP +#undef DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP - scoped_atomic_ref& operator=(scoped_atomic_ref const&) = delete; +#define DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(OPER, NAME) \ + DESUL_FUNCTION T fetch_##NAME() const noexcept { \ + return desul::atomic_fetch_##NAME(ptr_, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T NAME##_fetch() const noexcept { \ + return desul::atomic_##NAME##_fetch(ptr_, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T operator OPER() const noexcept { return NAME##_fetch(); } \ + 
DESUL_FUNCTION T operator OPER(int) const noexcept { return fetch_##NAME(); } - scoped_atomic_ref(scoped_atomic_ref const&) = default; + DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(++, inc) + DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(--, dec) - using Impl::basic_atomic_ref::operator=; +#undef DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT }; } // namespace desul diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp index 8c909bacdf4..0ade34f25df 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp @@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ +#include #include #include #include @@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} +template +struct atomic_exchange_available_hip { + constexpr static bool value = + ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) || + (sizeof(T) 
== 8 && alignof(T) == 8)) && + std::is_trivially_copyable::value; +}; -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t +template +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; + T* const dest, T compare, T value, MemoryOrder, MemoryScope) { + (void)__hip_atomic_compare_exchange_strong( + dest, + &compare, + value, + HIPMemoryOrder::value, + HIPMemoryOrder>::value, + HIPMemoryScope::value); + return compare; } -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 
64-bit"); - unsigned long long int return_val = - atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderAcqRel, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - 
device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); +template +__device__ std::enable_if_t::value, T> +device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope) { + T return_val = __hip_atomic_exchange(dest, + value, + HIPMemoryOrder::value, + HIPMemoryScope::value); return return_val; } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front @@ -169,7 +80,7 @@ device_atomic_compare_exchange( } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front T return_val; diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp index 69ed8bcb9fd..68622758d8e 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp @@ -69,56 +69,56 @@ inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } // clang-format on -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, TYPE) \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ + __device__ TYPE device_atomic_##FETCH_OP( \ TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ __threadfence(); \ TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ + device_atomic_##FETCH_OP(ptr, val, MemoryOrderRelaxed(), 
MemoryScopeDevice()); \ __threadfence(); \ return return_val; \ } \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ + __device__ TYPE device_atomic_##FETCH_OP( \ TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + return device_atomic_##FETCH_OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ } -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, int) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, unsigned int) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, unsigned long long) #ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, float) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, float) #else -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, double) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, float) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, double) #endif -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_min) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_max) 
+DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_and) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_or) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_xor) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(fetch_add) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_add) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(fetch_sub) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_sub) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_inc) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_dec) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(fetch_inc_mod, unsigned int) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(fetch_dec_mod, unsigned int) #undef DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT #undef DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp index a94ff8ef187..530195a8327 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp @@ -18,38 +18,38 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -#define DESUL_IMPL_ATOMIC_FETCH_OP(ANNOTATION, HOST_OR_DEVICE, OP) \ - template \ - ANNOTATION T HOST_OR_DEVICE##_atomic_fetch_##OP( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - return HOST_OR_DEVICE##_atomic_fetch_oper( \ - OP##_operator(), dest, val, order, scope); \ - } \ - 
template \ - ANNOTATION T HOST_OR_DEVICE##_atomic_##OP##_fetch( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - return HOST_OR_DEVICE##_atomic_oper_fetch( \ - OP##_operator(), dest, val, order, scope); \ +#define DESUL_IMPL_ATOMIC_FETCH_OP(ANNOTATION, HOST_OR_DEVICE, FETCH_OP, OP_FETCH) \ + template \ + ANNOTATION T HOST_OR_DEVICE##_atomic_##FETCH_OP( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_fetch_oper( \ + OP_FETCH##_operator(), dest, val, order, scope); \ + } \ + template \ + ANNOTATION T HOST_OR_DEVICE##_atomic_##OP_FETCH( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_oper_fetch( \ + OP_FETCH##_operator(), dest, val, order, scope); \ } -#define DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(OP) \ - DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_HOST_FUNCTION, host, OP) \ - DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_DEVICE_FUNCTION, device, OP) - -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(add) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(sub) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(max) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(min) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(mul) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(div) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(mod) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(and) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(or) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(xor) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(nand) - -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(inc_mod) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(dec_mod) +#define DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(FETCH_OP, OP_FETCH) \ + DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_HOST_FUNCTION, host, FETCH_OP, OP_FETCH) \ + DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_DEVICE_FUNCTION, device, FETCH_OP, OP_FETCH) + +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_add, add_fetch) 
+DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_sub, sub_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_max, max_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_min, min_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_mul, mul_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_div, div_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_mod, mod_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_and, and_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_or, or_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_xor, xor_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_nand, nand_fetch) + +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_inc_mod, inc_mod_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_dec_mod, dec_mod_fetch) #undef DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE #undef DESUL_IMPL_ATOMIC_FETCH_OP @@ -59,13 +59,13 @@ DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(dec_mod) ANNOTATION T HOST_OR_DEVICE##_atomic_fetch_##OP( \ T* const dest, const unsigned int val, MemoryOrder order, MemoryScope scope) { \ return HOST_OR_DEVICE##_atomic_fetch_oper( \ - OP##_operator(), dest, val, order, scope); \ + OP##_fetch_operator(), dest, val, order, scope); \ } \ template \ ANNOTATION T HOST_OR_DEVICE##_atomic_##OP##_fetch( \ T* const dest, const unsigned int val, MemoryOrder order, MemoryScope scope) { \ return HOST_OR_DEVICE##_atomic_oper_fetch( \ - OP##_operator(), dest, val, order, scope); \ + OP##_fetch_operator(), dest, val, order, scope); \ } #define DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE(OP) \ @@ -78,19 +78,21 @@ DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE(rshift) #undef DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE #undef DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT -#define DESUL_IMPL_ATOMIC_LOAD_AND_STORE(ANNOTATION, HOST_OR_DEVICE) \ - template \ - ANNOTATION T HOST_OR_DEVICE##_atomic_load( \ - const T* const dest, MemoryOrder order, MemoryScope scope) { \ - return 
HOST_OR_DEVICE##_atomic_fetch_oper( \ - load_operator(), const_cast(dest), T(), order, scope); \ - } \ - \ - template \ - ANNOTATION void HOST_OR_DEVICE##_atomic_store( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - (void)HOST_OR_DEVICE##_atomic_fetch_oper( \ - store_operator(), dest, val, order, scope); \ +// NOTE: using atomic_oper_fetch in the fallback implementation of atomic_store to avoid +// reading potentially uninitialized values which would yield undefined behavior. +#define DESUL_IMPL_ATOMIC_LOAD_AND_STORE(ANNOTATION, HOST_OR_DEVICE) \ + template \ + ANNOTATION T HOST_OR_DEVICE##_atomic_load( \ + const T* const dest, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_fetch_oper( \ + load_fetch_operator(), const_cast(dest), T(), order, scope); \ + } \ + \ + template \ + ANNOTATION void HOST_OR_DEVICE##_atomic_store( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + (void)HOST_OR_DEVICE##_atomic_oper_fetch( \ + store_fetch_operator(), dest, val, order, scope); \ } DESUL_IMPL_ATOMIC_LOAD_AND_STORE(DESUL_IMPL_HOST_FUNCTION, host) diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp index e9c749809de..8d9bd868250 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp @@ -9,99 +9,108 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_ #define DESUL_ATOMICS_FECH_OP_HIP_HPP_ +#include + namespace desul { namespace Impl { -// clang-format off -inline __device__ int device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned long long 
device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } - -inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } - -inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } - -inline __device__ int device_atomic_fetch_max( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, 
val); } -inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } - -inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } - -inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T) \ + template \ + __device__ inline T device_atomic_fetch_##OP( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_##OP(ptr, \ + val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP) \ + 
DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long) + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double) + +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add) +// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T) \ + template \ + __device__ inline T device_atomic_fetch_sub( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); } +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double) + +#undef 
DESUL_IMPL_HIP_ATOMIC_FETCH_SUB + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T) \ + template \ + __device__ inline T device_atomic_fetch_inc(T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + 1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } \ + template \ + __device__ inline T device_atomic_fetch_dec(T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); } +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long) -inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } -// clang-format on +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ - __threadfence(); \ - TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ - __threadfence(); \ - return return_val; \ + __device__ inline unsigned int device_atomic_fetch_inc_mod( \ 
+ unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_inc32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + __device__ inline unsigned int device_atomic_fetch_dec_mod( \ + unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_dec32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) - -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "") 
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "") -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD } // namespace Impl } // namespace desul diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp index be90cdbbd86..1f5159c4f8b 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp @@ -18,7 +18,7 @@ namespace desul { namespace Impl { template -struct max_operator { +struct max_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return (val1 > val2 ? val1 : val2); @@ -30,7 +30,7 @@ struct max_operator { }; template -struct min_operator { +struct min_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return (val1 < val2 ? 
val1 : val2); @@ -70,55 +70,55 @@ constexpr DESUL_FUNCTION } template -struct add_operator { +struct add_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 + val2; } }; template -struct sub_operator { +struct sub_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 - val2; } }; template -struct mul_operator { +struct mul_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 * val2; } }; template -struct div_operator { +struct div_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 / val2; } }; template -struct mod_operator { +struct mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 % val2; } }; template -struct and_operator { +struct and_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 & val2; } }; template -struct or_operator { +struct or_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 | val2; } }; template -struct xor_operator { +struct xor_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 ^ val2; } }; template -struct nand_operator { +struct nand_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return ~(val1 & val2); @@ -126,7 +126,7 @@ struct nand_operator { }; template -struct lshift_operator { +struct lshift_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 << val2; @@ -134,7 +134,7 @@ struct lshift_operator { }; template -struct rshift_operator { +struct rshift_fetch_operator 
{ DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 >> val2; @@ -142,7 +142,7 @@ struct rshift_operator { }; template -struct inc_mod_operator { +struct inc_mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return ((val1 >= val2) ? Scalar1(0) : val1 + Scalar1(1)); @@ -150,7 +150,7 @@ struct inc_mod_operator { }; template -struct dec_mod_operator { +struct dec_mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return (((val1 == Scalar1(0)) | (val1 > val2)) ? val2 : (val1 - Scalar1(1))); @@ -158,13 +158,13 @@ struct dec_mod_operator { }; template -struct store_operator { +struct store_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1&, const Scalar2& val2) { return val2; } }; template -struct load_operator { +struct load_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2&) { return val1; } }; diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index 8e42a37ba7c..24166462e7a 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -205,7 +205,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION -# if (!defined(__NVCC__) || (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) && \ +# if (!defined(__NVCC__) || (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10 >= 1170)) && \ ((defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201703) || \ (!defined(__cpp_deduction_guides) && MDSPAN_HAS_CXX_17)) # define _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION 1 diff --git 
a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 9a28c3ed5ca..d58d37732dd 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -16,12 +16,15 @@ #pragma once #include "dynamic_extent.hpp" +#include "utility.hpp" #ifdef __cpp_lib_span #include #endif #include +#include +#include #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -30,6 +33,7 @@ namespace detail { // Function used to check compatibility of extents in converting constructor // can't be a private member function for some reason. template +MDSPAN_INLINE_FUNCTION static constexpr std::integral_constant __check_compatible_extents( std::integral_constant, std::integer_sequence, @@ -46,6 +50,7 @@ struct __compare_extent_compatible : std::integral_constant +MDSPAN_INLINE_FUNCTION static constexpr std::integral_constant< bool, _MDSPAN_FOLD_AND(__compare_extent_compatible::value)> __check_compatible_extents( @@ -59,8 +64,8 @@ template MDSPAN_INLINE_FUNCTION static constexpr bool are_valid_indices() { return - (std::is_convertible::value && ... && true) && - (std::is_nothrow_constructible::value && ... 
&& true); + _MDSPAN_FOLD_AND(std::is_convertible::value) && + _MDSPAN_FOLD_AND(std::is_nothrow_constructible::value); } // ------------------------------------------------------------------ @@ -538,14 +543,9 @@ template class extents { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - if constexpr (rank() != extents::rank()) { - return false; - } else { - using common_t = std::common_type_t; - for (size_type r = 0; r < m_rank; r++) - if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; - } - return true; + return + rank() == extents::rank() && + detail::rankwise_equal(detail::with_rank{}, rhs, lhs, detail::extent); } #if !(MDSPAN_HAS_CXX_20) @@ -614,5 +614,80 @@ static #endif constexpr bool __is_extents_v = __is_extents::value; +template +MDSPAN_INLINE_FUNCTION +constexpr void +check_lower_bound(InputIndexType user_index, + ExtentsIndexType /* current_extent */, + std::true_type /* is_signed */) +{ + (void) user_index; // prevent unused variable warning +#ifdef _MDSPAN_DEBUG + assert(static_cast(user_index) >= 0); +#endif +} + +template +MDSPAN_INLINE_FUNCTION +constexpr void +check_lower_bound(InputIndexType /* user_index */, + ExtentsIndexType /* current_extent */, + std::false_type /* is_signed */) +{} + +template +MDSPAN_INLINE_FUNCTION +constexpr void +check_upper_bound(InputIndexType user_index, + ExtentsIndexType current_extent) +{ + (void) user_index; // prevent unused variable warnings + (void) current_extent; +#ifdef _MDSPAN_DEBUG + assert(static_cast(user_index) < current_extent); +#endif +} + +// Returning true to use AND fold instead of comma +// CPP14 mode doesn't like the use of void expressions +// with the way the _MDSPAN_FOLD_AND is set up +template +MDSPAN_INLINE_FUNCTION +constexpr bool +check_one_index(InputIndex user_index, + ExtentsIndexType current_extent) +{ + check_lower_bound(user_index, current_extent, + std::integral_constant::value>{}); + 
check_upper_bound(user_index, current_extent); + return true; +} + +template +MDSPAN_INLINE_FUNCTION +constexpr void +check_all_indices_helper(std::index_sequence, + const extents& exts, + Indices... indices) +{ + // Suppress warning about statement has no effect + (void) _MDSPAN_FOLD_AND( + (check_one_index(indices, exts.extent(RankIndices))) + ); +} + +template +MDSPAN_INLINE_FUNCTION +constexpr void +check_all_indices(const extents& exts, + Indices... indices) +{ + check_all_indices_helper(std::make_index_sequence(), + exts, indices...); +} + } // namespace detail } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index 83ed9ef7fe3..222fba7aa04 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,8 +18,11 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "layout_stride.hpp" +#include "utility.hpp" +#if MDSPAN_HAS_CXX_17 #include "../__p2642_bits/layout_padded_fwd.hpp" -#include +#endif #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -133,11 +136,11 @@ class layout_left::mapping { : __extents(__other.extents()) { MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: - check_padded_layout_converting_constructor_mandates(); + check_padded_layout_converting_constructor_mandates< + extents_type, _Mapping>(detail::with_rank{}); MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: check_padded_layout_converting_constructor_preconditions< - extents_type>(__other); + extents_type>(detail::with_rank{}, __other); } #endif @@ -156,17 +159,7 @@ class layout_left::mapping { * TODO: check precondition * other.required_span_size() is a representable value of type index_type */ - #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - if constexpr 
(extents_type::rank() > 0) { - index_type stride = 1; - using common_t = std::common_type_t; - for(rank_type r=0; r<__extents.rank(); r++) { - if(static_cast(stride) != static_cast(other.stride(r))) - std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); - stride *= __extents.extent(r); - } - } - #endif + detail::validate_strides(detail::with_rank{}, layout_left{}, __extents, other); } MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED mapping& operator=(mapping const&) noexcept = default; @@ -194,6 +187,9 @@ class layout_left::mapping { ) _MDSPAN_HOST_DEVICE constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! NDEBUG return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(idxs)...); } diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index 3d3927df7bc..284569f6533 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -18,9 +18,11 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" -#include #include "layout_stride.hpp" +#include "utility.hpp" +#if MDSPAN_HAS_CXX_17 #include "../__p2642_bits/layout_padded_fwd.hpp" +#endif namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -134,11 +136,11 @@ class layout_right::mapping { : __extents(__other.extents()) { MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: - check_padded_layout_converting_constructor_mandates(); + check_padded_layout_converting_constructor_mandates< + extents_type, _Mapping>(detail::with_rank{}); MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: check_padded_layout_converting_constructor_preconditions< - extents_type>(__other); + extents_type>(detail::with_rank{}, __other); } #endif @@ -157,17 +159,7 @@ class 
layout_right::mapping { * TODO: check precondition * other.required_span_size() is a representable value of type index_type */ - #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - if constexpr (extents_type::rank() > 0) { - index_type stride = 1; - using common_t = std::common_type_t; - for(rank_type r=__extents.rank(); r>0; r--) { - if(static_cast(stride) != static_cast(other.stride(r-1))) - std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); - stride *= __extents.extent(r-1); - } - } - #endif + detail::validate_strides(detail::with_rank{}, layout_right{}, __extents, other); } MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED mapping& operator=(mapping const&) noexcept = default; @@ -195,6 +187,9 @@ class layout_right::mapping { ) _MDSPAN_HOST_DEVICE constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! NDEBUG return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(idxs)...); } diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 15ad577d149..d6cdad2ab23 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -19,14 +19,16 @@ #include "extents.hpp" #include "trait_backports.hpp" #include "compressed_pair.hpp" +#include "utility.hpp" #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) # include "no_unique_address.hpp" #endif -#include -#include #include +#include +#include + #ifdef __cpp_lib_span #include #endif @@ -38,11 +40,11 @@ namespace MDSPAN_IMPL_STANDARD_NAMESPACE { struct layout_left { template - class mapping; + class mapping; }; struct layout_right { template - class mapping; + class mapping; }; namespace detail { @@ -79,6 
+81,7 @@ namespace detail { std::bool_constant::value; }; #endif + } // namespace detail struct layout_stride { @@ -199,6 +202,20 @@ struct layout_stride { return __strides_storage_t{static_cast(s[Idxs])...}; } + MDSPAN_TEMPLATE_REQUIRES( + class IntegralType, + // The is_convertible condition is added to make sfinae valid + // the extents_type::rank() > 0 is added to avoid use of non-standard zero length c-array + (std::is_convertible::value && (extents_type::rank() > 0)) + ) + MDSPAN_INLINE_FUNCTION + // despite the requirement some compilers still complain about zero length array during parsing + // making it length 1 now, but since the thing can't be instantiated due to requirement the actual + // instantiation of strides_storage will not fail despite mismatching length + static constexpr const __strides_storage_t fill_strides(mdspan_non_standard_tag, const IntegralType (&s)[extents_type::rank()>0?extents_type::rank():1]) { + return __strides_storage_t{static_cast(s[Idxs])...}; + } + #ifdef __cpp_lib_span template MDSPAN_INLINE_FUNCTION @@ -225,7 +242,11 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround>; - static constexpr __strides_storage_t strides_storage(std::true_type) { + static constexpr __strides_storage_t strides_storage(detail::with_rank<0>) { + return {}; + } + template + static constexpr __strides_storage_t strides_storage(detail::with_rank) { __strides_storage_t s{}; extents_type e; @@ -237,9 +258,6 @@ struct layout_stride { return s; } - static constexpr __strides_storage_t strides_storage(std::false_type) { - return {}; - } //---------------------------------------------------------------------------- @@ -262,7 +280,7 @@ struct layout_stride { : __base_t(__base_t{__member_pair_t( #endif extents_type(), - __strides_storage_t(strides_storage(std::integral_constant 0)>{})) + __strides_storage_t(strides_storage(detail::with_rank{})) #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) } #else @@ -309,6 +327,48 @@ struct layout_stride { */ } + MDSPAN_TEMPLATE_REQUIRES( + class IntegralTypes, + /* requires */ ( + // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type + // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping' + _MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t&, typename Extents::index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t&) && + (Extents::rank() > 0) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr + mapping( + mdspan_non_standard_tag, + extents_type const& e, + // despite the requirement some compilers still complain about zero length array during parsing + // making it length 1 now, but since the thing can't be instantiated due to requirement the actual + // instantiation of strides_storage will not fail despite mismatching length + IntegralTypes (&s)[extents_type::rank()>0?extents_type::rank():1] + ) noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( 
+#endif + e, __strides_storage_t(__impl::fill_strides(mdspan_non_standard, s)) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + { + /* + * TODO: check preconditions + * - s[i] > 0 is true for all i in the range [0, rank_ ). + * - REQUIRED-SPAN-SIZE(e, s) is a representable value of type index_type ([basic.fundamental]). + * - If rank_ is greater than 0, then there exists a permutation P of the integers in the + * range [0, rank_), such that s[ p_i ] >= s[ p_{i-1} ] * e.extent( p_{i-1} ) is true for + * all i in the range [1, rank_ ), where p_i is the ith element of P. + */ + } + #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES( class IntegralTypes, @@ -434,6 +494,9 @@ struct layout_stride { ) MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! NDEBUG return static_cast(__impl::_call_op_impl(*this, static_cast(idxs)...)); } @@ -444,32 +507,48 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } - MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - if constexpr (extents_type::rank() == 0) - return true; - else { - index_type span_size = required_span_size(); - if (span_size == static_cast(0)) { - if constexpr (extents_type::rank() == 1) { - return stride(0) == 1; - } else { - rank_type r_largest = 0; - for (rank_type r = 1; r < extents_type::rank(); r++) { - if (stride(r) > stride(r_largest)) { - r_largest = r; - } - } - for (rank_type r = 0; r < extents_type::rank(); r++) { - if (extents().extent(r) == 0 && r != r_largest) { - return false; - } - } - return true; - } - } else { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + + private: + constexpr bool
exhaustive_for_nonzero_span_size() const + { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + + constexpr bool is_exhaustive_impl(detail::with_rank<0>) const + { + return true; + } + constexpr bool is_exhaustive_impl(detail::with_rank<1>) const + { + if (required_span_size() != static_cast(0)) { + return exhaustive_for_nonzero_span_size(); + } + return stride(0) == 1; + } + template + constexpr bool is_exhaustive_impl(detail::with_rank) const + { + if (required_span_size() != static_cast(0)) { + return exhaustive_for_nonzero_span_size(); + } + + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; } } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + + public: + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return is_exhaustive_impl(detail::with_rank{}); } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } @@ -498,15 +577,9 @@ struct layout_stride { #endif MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { - bool strides_match = true; - if constexpr (extents_type::rank() > 0) { - using common_t = std::common_type_t; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); - } return (x.extents() == y.extents()) && (__impl::__OFFSET(y) == static_cast(0)) && - strides_match; + detail::rankwise_equal(detail::with_rank{}, x, y, detail::stride); } // This one is not technically part of the proposal. 
Just here to make implementation a bit more optimal hopefully @@ -532,7 +605,7 @@ struct layout_stride { ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(const mapping& x, const StridedLayoutMapping& y) noexcept { - return not (x == y); + return !(x == y); } MDSPAN_TEMPLATE_REQUIRES( @@ -561,4 +634,34 @@ struct layout_stride { }; }; +namespace detail { + +template +constexpr void validate_strides(with_rank<0>, Layout, const Extents&, const Mapping&) +{} + +template +constexpr void validate_strides(with_rank, Layout, const Extents& ext, const Mapping& other) +{ + static_assert(std::is_same::value && + (std::is_same::value || + std::is_same::value) + , "This function is only intended to validate construction of " + "a layout_left or layout_right mapping from a layout_stride mapping."); + + constexpr auto is_left = std::is_same::value; + + typename Extents::index_type expected_stride = 1; + + for (std::size_t r = 0; r < N; r++) { + const std::size_t s = is_left ? r : N - 1 - r; + + MDSPAN_IMPL_PRECONDITION(common_integral_compare(expected_stride, other.stride(s)) + && "invalid strides for layout_{left,right}"); + + expected_stride *= ext.extent(s); + } +} + +} // namespace detail } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp index 3eeb39755c8..b60c4261779 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp @@ -18,7 +18,12 @@ #include "config.hpp" +#include +#include #include // std::is_void +#if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_SYCL) +#include "assert.h" +#endif #ifndef _MDSPAN_HOST_DEVICE # if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) @@ -101,6 +106,69 @@ #define MDSPAN_IMPL_STANDARD_NAMESPACE_STRING 
MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE) #define MDSPAN_IMPL_PROPOSED_NAMESPACE_STRING MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE) "::" MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_PROPOSED_NAMESPACE) +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +#if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + printf("%s:%u: precondition failure: `%s`\n", file, line, cond); + assert(0); +} +#elif defined(_MDSPAN_HAS_SYCL) +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + sycl::ext::oneapi::experimental::printf("%s:%u: precondition failure: `%s`\n", file, line, cond); + assert(0); +} +#else +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + std::fprintf(stderr, "%s:%u: precondition failure: `%s`\n", file, line, cond); + std::abort(); +} +#endif + +} // namespace detail +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#ifndef MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER +#define MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER(cond, file, line) \ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::default_precondition_violation_handler(cond, file, line) +#endif + +#ifndef MDSPAN_IMPL_CHECK_PRECONDITION + #ifndef NDEBUG + #define MDSPAN_IMPL_CHECK_PRECONDITION 1 + #else + #define MDSPAN_IMPL_CHECK_PRECONDITION 0 + #endif +#endif + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +template +MDSPAN_FUNCTION constexpr void precondition(const char* cond, const char* file, unsigned line) +{ + if (!check) { return; } + // in case the macro doesn't use the arguments for custom macros + (void) cond; + (void) file; + (void) line; + MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER(cond, file, line); +} + +} // namespace detail +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#define
MDSPAN_IMPL_PRECONDITION(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::precondition(#__VA_ARGS__, __FILE__, __LINE__); \ + } \ + } while (0) + // end Preprocessor helpers }}}1 //============================================================================== @@ -574,7 +642,7 @@ __fold_left_assign_impl(Args&&... args) { template -constexpr __mdspan_enable_fold_comma __fold_comma_impl(Args&&... args) noexcept { return { }; } +constexpr __mdspan_enable_fold_comma __fold_comma_impl(Args&&...) noexcept { return { }; } template struct __bools; diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index d6ec49e65bf..23114aa5506 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -34,6 +34,8 @@ class mdspan private: static_assert(detail::__is_extents_v, MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::mdspan's Extents template parameter must be a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents."); + static_assert(std::is_same::value, + MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::mdspan's ElementType template parameter must be the same as its AccessorPolicy::element_type."); // Workaround for non-deducibility of the index sequence template parameter if it's given at the top level template @@ -321,7 +323,7 @@ class mdspan #endif // MDSPAN_USE_PAREN_OPERATOR MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { - return __impl::__size(*this); + return static_cast(__impl::__size(*this)); }; MDSPAN_INLINE_FUNCTION constexpr bool empty() const noexcept { diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp new file mode 100644 index 00000000000..e690cd6939b --- /dev/null +++ 
b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +// type alias used for rank-based tag dispatch +// +// this is used to enable alternatives to constexpr if when building for C++14 +// +template +using with_rank = std::integral_constant; + +template +MDSPAN_INLINE_FUNCTION +constexpr bool common_integral_compare(I1 x, I2 y) +{ + static_assert(std::is_integral::value && + std::is_integral::value, ""); + + using I = std::common_type_t; + return static_cast(x) == static_cast(y); +} + +template +MDSPAN_INLINE_FUNCTION +constexpr bool rankwise_equal(with_rank<0>, const T1&, const T2&, F) +{ + return true; +} + +template +MDSPAN_INLINE_FUNCTION +constexpr bool rankwise_equal(with_rank, const T1& x, const T2& y, F func) +{ + bool match = true; + + for (std::size_t r = 0; r < N; r++) { + match = match && common_integral_compare(func(x, r), func(y, r)); + } + + return match; +} + +constexpr struct +{ + template + MDSPAN_INLINE_FUNCTION + constexpr auto operator()(const T& x, I i) const + { + return x.extent(i); + } +} extent; + +constexpr struct +{ + template + MDSPAN_INLINE_FUNCTION + constexpr auto operator()(const T& x, I i) const + { + return x.stride(i); + } +} stride; + +} // namespace detail + +constexpr struct mdspan_non_standard_tag { +} mdspan_non_standard; + +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_Graph.cpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp similarity index 59% rename from lib/kokkos/core/unit_test/hip/TestHIP_Graph.cpp rename to lib/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp index 405cb76c643..00045215c48 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_Graph.cpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp @@ -14,5 +14,15 @@ // //@HEADER -#include -#include +#pragma once + +// 
backward compatibility import into experimental +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +template< ::std::size_t Rank, class IndexType = std::size_t> +using dims = + :: MDSPAN_IMPL_STANDARD_NAMESPACE :: dextents; + +} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index ca6948c9a9f..e1390fdeb57 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -17,10 +17,30 @@ #pragma once #include -#include #include +#include #include // index_sequence +// Suppress spurious warning with NVCC about no return statement. +// This is a known issue in NVCC and NVC++ +// Depending on the CUDA and GCC version we need both the builtin +// and the diagnostic push. I tried really hard to find something shorter +// but no luck ... +#if defined __NVCC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress = implicit_return_from_non_void_function +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push +#pragma diag_suppress implicit_return_from_non_void_function +#endif +#endif +#elif defined __NVCOMPILER +#pragma diagnostic push +#pragma diag_suppress = implicit_return_from_non_void_function +#endif + namespace MDSPAN_IMPL_STANDARD_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads @@ -31,18 +51,68 @@ template struct submdspan_mapping_result { }; namespace detail { +// We use const Slice& and not Slice&& because the various +// submdspan_mapping_impl overloads use their slices arguments +// multiple times. 
This makes perfect forwarding not useful, but we +// still don't want to pass those (possibly of size 64 x 3 bits) +// objects by value. +template +MDSPAN_INLINE_FUNCTION constexpr bool +one_slice_out_of_bounds(const IndexType &ext, const Slice &slice) { + using common_t = + std::common_type_t; + return static_cast(detail::first_of(slice)) == + static_cast(ext); +} + +template +MDSPAN_INLINE_FUNCTION constexpr bool +any_slice_out_of_bounds_helper(std::index_sequence, + const extents &exts, + const Slices &... slices) { + return _MDSPAN_FOLD_OR( + (one_slice_out_of_bounds(exts.extent(RankIndices), slices))); +} + +template +MDSPAN_INLINE_FUNCTION constexpr bool +any_slice_out_of_bounds(const extents &exts, + const Slices &... slices) { + return any_slice_out_of_bounds_helper( + std::make_index_sequence(), exts, slices...); +} + // constructs sub strides template -MDSPAN_INLINE_FUNCTION -constexpr auto -construct_sub_strides(const SrcMapping &src_mapping, - std::index_sequence, - const std::tuple &slices_stride_factor) { +MDSPAN_INLINE_FUNCTION constexpr auto construct_sub_strides( + const SrcMapping &src_mapping, std::index_sequence, + const std::tuple &slices_stride_factor) { using index_type = typename SrcMapping::index_type; return std::array{ (static_cast(src_mapping.stride(InvMapIdxs)) * static_cast(std::get(slices_stride_factor)))...}; } + +template +struct is_range_slice { + constexpr static bool value = + std::is_same_v || + std::is_convertible_v>; +}; + +template +constexpr bool is_range_slice_v = is_range_slice::value; + +template +struct is_index_slice { + constexpr static bool value = std::is_convertible_v; +}; + +template +constexpr bool is_index_slice_v = is_index_slice::value; + } // namespace detail //********************************** @@ -51,52 +121,90 @@ construct_sub_strides(const SrcMapping &src_mapping, namespace detail { // Figure out whether to preserve layout_left -template -struct preserve_layout_left_mapping; +template +struct 
deduce_layout_left_submapping; -template -struct preserve_layout_left_mapping, SubRank, - SliceSpecifiers...> { - constexpr static bool value = - // Preserve layout for rank 0 - (SubRank == 0) || - ( - // Slice specifiers up to subrank need to be full_extent_t - except - // for the last one which could also be tuple but not a strided index - // range slice specifiers after subrank are integrals - ((Idx > SubRank - 1) || // these are only integral slice specifiers - (std::is_same_v) || - ((Idx == SubRank - 1) && - std::is_convertible_v>)) && - ...); +template +struct deduce_layout_left_submapping< + IndexType, SubRank, std::index_sequence, SliceSpecifiers...> { + + using count_range = index_sequence_scan_impl< + 0u, (is_index_slice_v ? 0u : 1u)...>; + + constexpr static int gap_len = + (((Idx > 0 && count_range::get(Idx) == 1 && + is_index_slice_v) + ? 1 + : 0) + + ... + 0); + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_left_value() { + // Use layout_left for rank 0 + if constexpr (SubRank == 0) { + return true; + // Use layout_left for rank 1 result if leftmost slice specifier is range like + } else if constexpr (SubRank == 1) { + return ((Idx > 0 || is_range_slice_v)&&...); + } else { + // Preserve if leftmost SubRank-1 slices are full_extent_t and + // the slice at idx Subrank - 1 is a range and + // for idx > SubRank the slice is an index + return ((((Idx < SubRank - 1) && std::is_same_v) || + ((Idx == SubRank - 1) && is_range_slice_v) || + ((Idx > SubRank - 1) && is_index_slice_v)) && ...); + } +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif + } + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_left_padded_value() { + // Technically could also keep layout_left_padded for SubRank==0 + // and SubRank==1 with leftmost slice specifier being a contiguous range + // but we intercept these cases separately + + // In all other cases: + // leftmost slice must be range + // then there can be a 
gap with index slices + // then SubRank - 2 full_extent slices + // then another range slice + // then more index slices + // e.g. R I I I F F F R I I for obtaining a rank-5 from a rank-10 + return ((((Idx == 0) && is_range_slice_v) || + ((Idx > 0 && Idx <= gap_len) && is_index_slice_v) || + ((Idx > gap_len && Idx < gap_len + SubRank - 1) && std::is_same_v) || + ((Idx == gap_len + SubRank - 1) && is_range_slice_v) || + ((Idx > gap_len + SubRank - 1) && is_index_slice_v)) && ... ); + } }; + +// We are reusing the same thing for layout_left and layout_left_padded +// For layout_left as source StaticStride is static_extent(0) +template +struct compute_s_static_layout_left { + // Neither StaticStride nor any of the provided extents can be zero. + // StaticStride can never be zero, the static_extents we are looking at are associated with + // integral slice specifiers - which wouldn't be valid for zero extent + template + MDSPAN_INLINE_FUNCTION + static constexpr size_t value(std::index_sequence) { + size_t val = ((Idx>0 && Idx<=NumGaps ? (Extents::static_extent(Idx) == dynamic_extent?0:Extents::static_extent(Idx)) : 1) * ... * (StaticStride == dynamic_extent?0:StaticStride)); + return val == 0?dynamic_extent:val; + } +}; + } // namespace detail -// Suppress spurious warning with NVCC about no return statement. -// This is a known issue in NVCC and NVC++ -// Depending on the CUDA and GCC version we need both the builtin -// and the diagnostic push. I tried really hard to find something shorter -// but no luck ... 
-#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic push - #pragma nv_diag_suppress = implicit_return_from_non_void_function - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic push - #pragma diag_suppress implicit_return_from_non_void_function - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic push - #pragma diag_suppress = implicit_return_from_non_void_function -#endif // Actual submdspan mapping call template template -MDSPAN_INLINE_FUNCTION -constexpr auto -layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... slices) const { +MDSPAN_INLINE_FUNCTION constexpr auto +layout_left::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { // compute sub extents using src_ext_t = Extents; @@ -104,51 +212,137 @@ layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... slices) using dst_ext_t = decltype(dst_ext); // figure out sub layout type - constexpr bool preserve_layout = detail::preserve_layout_left_mapping< - decltype(std::make_index_sequence()), dst_ext_t::rank(), - SliceSpecifiers...>::value; - using dst_layout_t = - std::conditional_t; - using dst_mapping_t = typename dst_layout_t::template mapping; - - if constexpr (std::is_same_v) { + using deduce_layout = detail::deduce_layout_left_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + std::make_index_sequence, + SliceSpecifiers...>; + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast( + out_of_bounds ? 
this->required_span_size() + : this->operator()(detail::first_of(slices)...)); + + if constexpr (deduce_layout::layout_left_value()) { // layout_left case + using dst_mapping_t = typename layout_left::template mapping; + return submdspan_mapping_result{dst_mapping_t(dst_ext), + offset}; + } else if constexpr (deduce_layout::layout_left_padded_value()) { + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left::value(std::make_index_sequence()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded::template mapping; return submdspan_mapping_result{ - dst_mapping_t(dst_ext), - static_cast(this->operator()(detail::first_of(slices)...))}; + dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset}; } else { // layout_stride case - auto inv_map = detail::inv_map_rank( - std::integral_constant(), - std::index_sequence<>(), - slices...); - return submdspan_mapping_result{ - dst_mapping_t(dst_ext, detail::construct_sub_strides( - *this, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple{detail::stride_of(slices)...})), - #else - std::tuple{detail::stride_of(slices)...})), - #endif - static_cast(this->operator()(detail::first_of(slices)...))}; + using dst_mapping_t = typename layout_stride::mapping; + auto inv_map = detail::inv_map_rank(std::integral_constant(), + std::index_sequence<>(), slices...); + return submdspan_mapping_result { + dst_mapping_t(dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue But Clang-CUDA also doesn't accept the use of deduction guide 
so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + std::tuple{ + detail::stride_of(slices)...})), +#else + std::tuple{detail::stride_of(slices)...})), +#endif + offset + }; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); #endif } -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic pop - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic pop - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic pop + +template +template +template +MDSPAN_INLINE_FUNCTION constexpr auto +MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + + // compute sub extents + using src_ext_t = Extents; + auto dst_ext = submdspan_extents(extents(), slices...); + using dst_ext_t = decltype(dst_ext); + + if constexpr (Extents::rank() == 0) { // rank-0 case + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded::template mapping; + return submdspan_mapping_result{*this, 0}; + } else { + const bool out_of_bounds = + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast( + out_of_bounds ? 
this->required_span_size() + : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...)); + if constexpr (dst_ext_t::rank() == 0) { // result rank-0 + // The following for some reasons leads to compiler error later, while not using a typedef works: + // Compilers: CUDA 11.2 with GCC 9.1 + // + // using dst_mapping_t = typename layout_left::template mapping; + // return submdspan_mapping_result{dst_mapping_t{dst_ext}, offset}; + // + // Error: submdspan_mapping.hpp:299:23: error: 'dst_mapping_t' does not name a type + // 299 | using dst_mapping_t = typename layout_left::template mapping; + // The same error is given (about dst_mapping_t not naming type) when a different name is used in 299: + // using dst_mapping_t2 = typename layout_left::template mapping; + + return submdspan_mapping_result> + {typename layout_left::template mapping{dst_ext}, offset}; + } else { // general case + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. 
+ // figure out sub layout type + using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_left_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + decltype(std::make_index_sequence()), + SliceSpecifiers...>; + + if constexpr (deduce_layout::layout_left_value() && dst_ext_t::rank() == 1) { // getting rank-1 from leftmost + using dst_mapping_t = typename layout_left::template mapping; + return submdspan_mapping_result{dst_mapping_t{dst_ext}, offset}; + } else if constexpr (deduce_layout::layout_left_padded_value()) { // can keep layout_left_padded + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left::value(std::make_index_sequence()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded::template mapping; + return submdspan_mapping_result{ + dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset}; + } else { // layout_stride + auto inv_map = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant(), + std::index_sequence<>(), slices...); + using dst_mapping_t = typename layout_stride::template mapping; + return submdspan_mapping_result { + dst_mapping_t(dst_ext, + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue But Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + std::tuple{ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...})), +#else + std::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...})), #endif + offset + }; + } + } + } + + +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif +} //********************************** // layout_right 
submdspan_mapping @@ -156,134 +350,276 @@ layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... slices) namespace detail { // Figure out whether to preserve layout_right -template -struct preserve_layout_right_mapping; +template +struct deduce_layout_right_submapping; -template -struct preserve_layout_right_mapping, SubRank, - SliceSpecifiers...> { - constexpr static size_t SrcRank = sizeof...(SliceSpecifiers); - constexpr static bool value = - // Preserve layout for rank 0 - (SubRank == 0) || - ( - // The last subrank slice specifiers need to be full_extent_t - except - // for the srcrank-subrank one which could also be tuple but not a - // strided index range slice specifiers before srcrank-subrank are - // integrals - ((Idx < - SrcRank - SubRank) || // these are only integral slice specifiers - (std::is_same_v) || - ((Idx == SrcRank - SubRank) && - std::is_convertible_v>)) && - ...); +template +struct deduce_layout_right_submapping< + IndexType, SubRank, std::index_sequence, SliceSpecifiers...> { + + static constexpr size_t Rank = sizeof...(Idx); + using count_range = index_sequence_scan_impl< + 0u, (std::is_convertible_v ? 0u : 1u)...>; + //__static_partial_sums...>; + constexpr static int gap_len = + (((Idx < Rank - 1 && count_range::get(Idx) == SubRank - 1 && + std::is_convertible_v) + ? 1 + : 0) + + ... 
+ 0); + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_right_value() { + // Use layout_right for rank 0 + if constexpr (SubRank == 0) { + return true; + // Use layout_right for rank 1 result if rightmost slice specifier is range like + } else if constexpr (SubRank == 1) { + return ((Idx < Rank - 1 || is_range_slice_v)&&...); + } else { + // Preserve if rightmost SubRank-1 slices are full_extent_t and + // the slice at idx Rank-Subrank is a range and + // for idx < Rank - SubRank the slice is an index + return ((((Idx >= Rank - SubRank) && std::is_same_v) || + ((Idx == Rank - SubRank) && is_range_slice_v) || + ((Idx < Rank - SubRank) && is_index_slice_v)) && ...); + } +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif + } + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_right_padded_value() { + // Technically could also keep layout_right_padded for SubRank==0 + // and SubRank==1 with rightmost slice specifier being a contiguous range + // but we intercept these cases separately + + // In all other cases: + // rightmost slice must be range + // then there can be a gap with index slices + // then SubRank - 2 full_extent slices + // then another range slice + // then more index slices + // e.g. I I R F F F I I I R for obtaining a rank-5 from a rank-10 + return ((((Idx == Rank - 1) && is_range_slice_v) || + ((Idx >= Rank - gap_len - 1 && Idx < Rank - 1) && is_index_slice_v) || + ((Idx > Rank - gap_len - SubRank && Idx < Rank - gap_len - 1) && std::is_same_v) || + ((Idx == Rank - gap_len - SubRank) && is_range_slice_v) || + ((Idx < Rank - gap_len - SubRank) && is_index_slice_v)) && ... ); + } }; + +// We are reusing the same thing for layout_right and layout_right_padded +// For layout_right as source StaticStride is static_extent(Rank-1) +template +struct compute_s_static_layout_right { + // Neither StaticStride nor any of the provided extents can be zero. 
+ // StaticStride can never be zero, the static_extents we are looking at are associated with + // integral slice specifiers - which wouldn't be valid for zero extent + template + MDSPAN_INLINE_FUNCTION + static constexpr size_t value(std::index_sequence) { + size_t val = ((Idx >= Extents::rank() - 1 - NumGaps && Idx < Extents::rank() - 1 ? (Extents::static_extent(Idx) == dynamic_extent?0:Extents::static_extent(Idx)) : 1) * ... * (StaticStride == dynamic_extent?0:StaticStride)); + return val == 0?dynamic_extent:val; + } +}; + } // namespace detail -// Suppress spurious warning with NVCC about no return statement. -// This is a known issue in NVCC and NVC++ -// Depending on the CUDA and GCC version we need both the builtin -// and the diagnostic push. I tried really hard to find something shorter -// but no luck ... -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic push - #pragma nv_diag_suppress = implicit_return_from_non_void_function - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic push - #pragma diag_suppress implicit_return_from_non_void_function - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic push - #pragma diag_suppress = implicit_return_from_non_void_function -#endif +// Actual submdspan mapping call template template -MDSPAN_INLINE_FUNCTION -constexpr auto +MDSPAN_INLINE_FUNCTION constexpr auto layout_right::mapping::submdspan_mapping_impl( - SliceSpecifiers... slices) const { - // get sub extents + SliceSpecifiers... 
slices) const { + + // compute sub extents using src_ext_t = Extents; auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); - // determine new layout type - constexpr bool preserve_layout = detail::preserve_layout_right_mapping< - decltype(std::make_index_sequence()), dst_ext_t::rank(), - SliceSpecifiers...>::value; - using dst_layout_t = - std::conditional_t; - using dst_mapping_t = typename dst_layout_t::template mapping; + // figure out sub layout type + using deduce_layout = detail::deduce_layout_right_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + std::make_index_sequence, + SliceSpecifiers...>; + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast( + out_of_bounds ? this->required_span_size() + : this->operator()(detail::first_of(slices)...)); - if constexpr (std::is_same_v) { + if constexpr (deduce_layout::layout_right_value()) { // layout_right case + using dst_mapping_t = typename layout_right::mapping; + return submdspan_mapping_result{dst_mapping_t(dst_ext), + offset}; + } else if constexpr (deduce_layout::layout_right_padded_value()) { + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right::value(std::make_index_sequence()); // _right helper: gap extents are Rank-1-gap_len .. Rank-2, matching the stride passed below + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded::template mapping; return submdspan_mapping_result{ - dst_mapping_t(dst_ext), - static_cast(this->operator()(detail::first_of(slices)...))}; + dst_mapping_t(dst_ext, + stride(src_ext_t::rank() - 2 - deduce_layout::gap_len)), + offset}; } else { // layout_stride case - auto inv_map = detail::inv_map_rank( - std::integral_constant(), - std::index_sequence<>(), - slices...); - return submdspan_mapping_result{ - dst_mapping_t(dst_ext, 
detail::construct_sub_strides( - *this, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple{detail::stride_of(slices)...})), - #else - std::tuple{detail::stride_of(slices)...})), - #endif - static_cast(this->operator()(detail::first_of(slices)...))}; + using dst_mapping_t = typename layout_stride::mapping; + auto inv_map = detail::inv_map_rank(std::integral_constant(), + std::index_sequence<>(), slices...); + return submdspan_mapping_result { + dst_mapping_t(dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue But Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + std::tuple{ + detail::stride_of(slices)...})), +#else + std::tuple{detail::stride_of(slices)...})), +#endif + offset + }; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); #endif } -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic pop - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic pop - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic pop + +template +template +template +MDSPAN_INLINE_FUNCTION constexpr auto +MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded::mapping::submdspan_mapping_impl( + SliceSpecifiers... 
slices) const { + + // compute sub extents + using src_ext_t = Extents; + auto dst_ext = submdspan_extents(extents(), slices...); + using dst_ext_t = decltype(dst_ext); + + if constexpr (Extents::rank() == 0) { // rank-0 case + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded::template mapping; + return submdspan_mapping_result{*this, 0}; + } else { + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + // figure out sub layout type + const bool out_of_bounds = + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast( + out_of_bounds ? this->required_span_size() + : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...)); + if constexpr (dst_ext_t::rank() == 0) { // result rank-0 + // Same issue as in layout_left_padded: see comment there + // using dst_mapping_t = typename layout_right::template mapping; + // return submdspan_mapping_result{dst_mapping_t{dst_ext}, offset}; + return submdspan_mapping_result> + {typename layout_right::template mapping{dst_ext}, offset}; + } else { // general case + using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_right_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + decltype(std::make_index_sequence()), + SliceSpecifiers...>; + + if constexpr (deduce_layout::layout_right_value() && dst_ext_t::rank() == 1) { // getting rank-1 from rightmost + using dst_mapping_t = typename layout_right::template mapping; + return submdspan_mapping_result{dst_mapping_t{dst_ext}, offset}; + } else if constexpr (deduce_layout::layout_right_padded_value()) { // can keep layout_right_padded + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right::value(std::make_index_sequence()); + using dst_mapping_t = typename 
MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded::template mapping; + return submdspan_mapping_result{ + dst_mapping_t(dst_ext, stride(Extents::rank() - 2 - deduce_layout::gap_len)), offset}; + } else { // layout_stride + auto inv_map = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant(), + std::index_sequence<>(), slices...); + using dst_mapping_t = typename layout_stride::template mapping; + return submdspan_mapping_result { + dst_mapping_t(dst_ext, + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue But Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + std::tuple{ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...})), +#else + std::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...})), #endif + offset + }; + } + } + } + + +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif +} //********************************** // layout_stride submdspan_mapping //********************************* template template -MDSPAN_INLINE_FUNCTION -constexpr auto +MDSPAN_INLINE_FUNCTION constexpr auto layout_stride::mapping::submdspan_mapping_impl( - SliceSpecifiers... slices) const { + SliceSpecifiers... 
slices) const { auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); - auto inv_map = detail::inv_map_rank( - std::integral_constant(), - std::index_sequence<>(), - slices...); + auto inv_map = detail::inv_map_rank(std::integral_constant(), + std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return submdspan_mapping_result{ - dst_mapping_t(dst_ext, detail::construct_sub_strides( - *this, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple(detail::stride_of(slices)...))), + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast( + out_of_bounds ? 
this->required_span_size() + : this->operator()(detail::first_of(slices)...)); + + return submdspan_mapping_result { + dst_mapping_t(dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue +#if defined(_MDSPAN_HAS_HIP) || \ + (defined(__NVCC__) && \ + (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) + std::tuple( + detail::stride_of(slices)...))), #else - std::tuple(detail::stride_of(slices)...))), + std::tuple(detail::stride_of(slices)...))), #endif - static_cast(this->operator()(detail::first_of(slices)...))}; + offset + }; } } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#if defined __NVCC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#elif defined __NVCOMPILER +#pragma diagnostic pop +#endif diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp index a8014867923..e5f7bee4cad 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -59,6 +59,10 @@ MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { } else { return dynamic_extent; } + // Missing return statement warning from NVCC and ICC +#if defined(__NVCC__) || defined(__INTEL_COMPILER) + return 0; +#endif } template @@ -69,7 +73,7 @@ struct static_array_type_for_padded_extent using extents_type = _Extents; using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< index_type, size_t, dynamic_extent, - detail::get_actual_static_padding_value()>; }; @@ -101,6 +105,10 @@ struct padded_extent { } else { return init_padding(exts, padding_value); } + // Missing return statement 
warning from NVCC and ICC +#if defined(__NVCC__) || defined(__INTEL_COMPILER) + return {}; +#endif } MDSPAN_INLINE_FUNCTION static constexpr static_array_type @@ -112,6 +120,10 @@ struct padded_extent { } else { return {}; } + // Missing return statement warning from NVCC and ICC +#if defined(__NVCC__) || defined(__INTEL_COMPILER) + return {}; +#endif } template @@ -123,6 +135,10 @@ struct padded_extent { } else { return {}; } + // Missing return statement warning from NVCC and ICC +#if defined(__NVCC__) || defined(__INTEL_COMPILER) + return {}; +#endif } }; } // namespace detail @@ -158,19 +174,21 @@ class layout_left_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { index_type indices[] = {static_cast(index_offsets)...}; // self-recursive fold trick from // @@ -203,7 +221,7 @@ class layout_left_padded::mapping { #endif MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; - MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping& operator=(const mapping&) noexcept = default; /** * Initializes the mapping with the given extents. @@ -241,62 +259,71 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_left::mapping`. 
* - * This overload participates in overload resolution only if `is_constructible_v` is true. - * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; - * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + * This overload participates in overload resolution only if + * `is_constructible_v` is true. If + * `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, + * or `OtherExtents::static_extent(0)` must be `dynamic_extent`; otherwise, + * `OtherExtents::static_extent(0)` must be equal to the least multiple of + * `padding_value` greater than or equal to `extents_type::static_extent(0)` */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (static_padding_stride != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (static_padding_stride == + 
_OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - } + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) - constexpr - mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -305,42 +332,43 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) - constexpr - mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) - { + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; + ++r) { s[r] = value; value *= exts.extent(r); } @@ -349,12 +377,11 @@ class layout_left_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + 
required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = padded_stride.value(0); @@ -375,40 +402,51 @@ class layout_left_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { +#if !defined(NDEBUG) + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::check_all_indices(this->extents(), + idxs...); +#endif // ! NDEBUG return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool 
is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == 0) return index_type(1); + if (r == 0) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + for (rank_type k = 1; k < r; k++) + value *= exts.extent(k); return value; } @@ -416,26 +454,26 @@ class layout_left_padded::mapping { /** * Equality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_left_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -444,20 +482,31 @@ class layout_left_padded::mapping { /** * Inequality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif + + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; template @@ -490,25 +539,27 @@ class layout_right_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { // self-recursive fold trick from // index_type res = 0; ((res = static_cast(index_offsets) + (Ranks == extent_to_pad_idx ? 
padded_stride.value(0) - : exts.extent(Ranks)) * + : exts.extent(Ranks)) * res), ...); return res; @@ -533,7 +584,7 @@ class layout_right_padded::mapping { #endif MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; - MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping& operator=(const mapping&) noexcept = default; /** * Initializes the mapping with the given extents. @@ -577,56 +628,62 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (padded_stride_type::static_value() != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (padded_stride_type::static_value() == + _OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. 
* - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -635,41 +692,42 @@ class layout_right_padded::mapping { /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) - { + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) { s[r] = value; value *= exts.extent(r); } @@ -678,17 +736,15 @@ class layout_right_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 
1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = 1; - for (rank_type r = 0; r < extent_to_pad_idx; ++r) - { + for (rank_type r = 0; r < extent_to_pad_idx; ++r) { value *= exts.extent(r); } return value * padded_stride.value(0); @@ -705,40 +761,47 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == 
padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr index_type + stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == extents_type::rank() - 1) return index_type(1); + if (r == extents_type::rank() - 1) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + for (rank_type k = extents_type::rank() - 2; k > r; k--) + value *= exts.extent(k); return value; } @@ -746,26 +809,26 @@ class layout_right_padded::mapping { /** * Equality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_right_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -774,20 +837,31 @@ class layout_right_padded::mapping { /** * Inequality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif + + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } } diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp index 945f091a2dc..18daa28cc68 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp @@ -17,6 +17,7 @@ #include #include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/utility.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { @@ -82,36 +83,49 @@ struct is_layout_right_padded_mapping<_Mapping, std::enable_if_t::template mapping>::value>> : std::true_type {}; + +template +constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<0>) {} + template -constexpr void check_padded_layout_converting_constructor_mandates() +constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<1>) {} 
+ +template +constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank) { - if constexpr (_LayoutExtentsType::rank() > 1) { - using extents_type = typename _PaddedLayoutMappingType::extents_type; - constexpr auto padding_value = _PaddedLayoutMappingType::padding_value; - constexpr auto idx = layout_padded_constants::extent_to_pad_idx; - if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) && - (extents_type::static_extent(idx) != dynamic_extent) && - (padding_value != dynamic_extent)) { - if constexpr (padding_value == 0) { - static_assert(_LayoutExtentsType::static_extent(idx) == 0); - } else { - static_assert( - _LayoutExtentsType::static_extent(idx) % padding_value == 0); - } - } - } + using extents_type = typename _PaddedLayoutMappingType::extents_type; + constexpr auto padding_value = _PaddedLayoutMappingType::padding_value; + constexpr auto idx = layout_padded_constants::extent_to_pad_idx; + + constexpr auto statically_determinable = + (_LayoutExtentsType::static_extent(idx) != dynamic_extent) && + (extents_type::static_extent(idx) != dynamic_extent) && + (padding_value != dynamic_extent); + + static_assert(!statically_determinable || + (padding_value == 0 + ? 
_LayoutExtentsType::static_extent(idx) == 0 + : _LayoutExtentsType::static_extent(idx) % padding_value == 0), + ""); } template -constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) { - if constexpr (_ExtentsType::rank() > 1) { - constexpr auto padded_stride_idx = - layout_padded_constants::padded_stride_idx; - constexpr auto extent_to_pad_idx = layout_padded_constants::extent_to_pad_idx; - assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx)); - } +constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<0>, + const _OtherMapping&) {} +template +constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<1>, + const _OtherMapping&) {} +template +constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank, + const _OtherMapping &other_mapping) { + constexpr auto padded_stride_idx = + layout_padded_constants::padded_stride_idx; + constexpr auto extent_to_pad_idx = layout_padded_constants::extent_to_pad_idx; + MDSPAN_IMPL_PRECONDITION(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx)); } + + } } } diff --git a/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp b/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp index ac72a1a4e64..4a0e354ffd0 100644 --- a/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp +++ b/lib/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp @@ -38,5 +38,6 @@ #include "../experimental/__p2642_bits/layout_padded.hpp" #include "../experimental/__p2630_bits/submdspan.hpp" #endif +#include "../experimental/__p2389_bits/dims.hpp" #endif // MDSPAN_HPP_ From cfcd7ddfbcc308e75b8608cf6669f2751ed06776 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 11 Sep 2024 09:31:54 -0600 Subject: [PATCH 02/15] Add 
KOKKOS_ENABLE_ATOMICS_BYPASS --- src/MAKE/OPTIONS/Makefile.kokkos_mpi_only | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only b/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only index e1f7005617c..5c39ac8f3e7 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only +++ b/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only @@ -7,7 +7,7 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -CCFLAGS = -g -O3 -DNDEBUG +CCFLAGS = -g -O3 -DNDEBUG -DKOKKOS_ENABLE_ATOMICS_BYPASS SHFLAGS = -fPIC # uncomment when compiling with Intel 21.5 or older FMTFLAGS = # -std=c++11 From 64db592a847a28443f6ce2e2e57cb844d0a744b9 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 11 Sep 2024 09:45:21 -0600 Subject: [PATCH 03/15] Update Kokkos version in CMake --- cmake/Modules/Packages/KOKKOS.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index bf0a18d324a..cbda60fc53f 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -45,8 +45,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "243de871b3dc2cf3990c1c404032df83" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "95af2e2d4b10a67a63cce09715fba127" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) GetFallbackURL(KOKKOS_URL KOKKOS_FALLBACK) @@ -71,7 +71,7 @@ if(DOWNLOAD_KOKKOS) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 4.3.01 REQUIRED CONFIG) + 
find_package(Kokkos 4.4.00 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) else() set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) From dc15e4ae81e34d2eda076afe5a5a42fad9f7c99f Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 12 Sep 2024 10:50:19 -0600 Subject: [PATCH 04/15] Enforce options in CMake --- cmake/Modules/Packages/KOKKOS.cmake | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index cbda60fc53f..8c5b1229b5c 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -10,6 +10,8 @@ endif() if(Kokkos_ENABLE_CUDA) message(STATUS "KOKKOS: Enabling CUDA LAMBDA function support") set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE) + message(STATUS "KOKKOS: Disabling CUDA malloc async support") + set(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "" FORCE) endif() # Adding OpenMP compiler flags without the checks done for # BUILD_OMP can result in compile failures. Enforce consistency. 
@@ -18,6 +20,15 @@ if(Kokkos_ENABLE_OPENMP) message(FATAL_ERROR "Must enable BUILD_OMP with Kokkos_ENABLE_OPENMP") endif() endif() + +if(Kokkos_ENABLE_SERIAL) + if(NOT (Kokkos_ENABLE_OPENMP OR Kokkos_ENABLE_THREADS OR + Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_OPENMPTARGET)) + message(STATUS "KOKKOS: Disabling atomics for Serial Backend") + set(Kokkos_ENABLE_ATOMICS_BYPASS ON CACHE BOOL "" FORCE) + endif() +endif() ######################################################################## option(EXTERNAL_KOKKOS "Build against external kokkos library" OFF) From 7b192282994cfca22fae9f06313f6cc996c33ec8 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 12 Sep 2024 11:30:11 -0600 Subject: [PATCH 05/15] Make realloc_kokkos function safer, suggested in discussion with @weinbe2 --- src/KOKKOS/memory_kokkos.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 0c7555875e3..2b136375b21 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -327,13 +327,23 @@ void destroy_kokkos(TYPE data, typename TYPE::value_type*** &array) /* ---------------------------------------------------------------------- reallocate Kokkos views without initialization deallocate first to reduce memory use + for the first case, enforce values are given for all dimensions + for the second case, allow zero values given for dimensions ------------------------------------------------------------------------- */ template -static void realloc_kokkos(TYPE &data, const char *name, Indices... ns) +static std::enable_if_t realloc_kokkos(TYPE &data, const char *name, Indices... ns) { data = TYPE(); - data = TYPE(Kokkos::NoInit(std::string(name)), ns...); + data = TYPE(std::string(name), ns...); +} + +template +static std::enable_if_t realloc_kokkos_allow_zero(TYPE &data, const char *name, Indices... 
ns) +{ + data = TYPE(); + if constexpr (sizeof...(Indices) != 0) + data = TYPE(std::string(name), ns...); } /* ---------------------------------------------------------------------- From 72024e90c987cb98d642a5993368d880c7928806 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 12 Sep 2024 14:18:31 -0600 Subject: [PATCH 06/15] Only check dynamic rank --- src/KOKKOS/memory_kokkos.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 2b136375b21..026c8afcb42 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -332,14 +332,14 @@ void destroy_kokkos(TYPE data, typename TYPE::value_type*** &array) ------------------------------------------------------------------------- */ template -static std::enable_if_t realloc_kokkos(TYPE &data, const char *name, Indices... ns) +static std::enable_if_t realloc_kokkos(TYPE &data, const char *name, Indices... ns) { data = TYPE(); data = TYPE(std::string(name), ns...); } template -static std::enable_if_t realloc_kokkos_allow_zero(TYPE &data, const char *name, Indices... ns) +static std::enable_if_t realloc_kokkos_allow_zero(TYPE &data, const char *name, Indices... 
ns) { data = TYPE(); if constexpr (sizeof...(Indices) != 0) From 5699e3c8cf43e6839c636eabfbb60ba148b341ac Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 12 Sep 2024 17:34:12 -0400 Subject: [PATCH 07/15] add overridable settings --- cmake/Modules/Packages/KOKKOS.cmake | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 8c5b1229b5c..08b109eff4a 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -10,8 +10,21 @@ endif() if(Kokkos_ENABLE_CUDA) message(STATUS "KOKKOS: Enabling CUDA LAMBDA function support") set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE) - message(STATUS "KOKKOS: Disabling CUDA malloc async support") - set(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "" FORCE) + option(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC "CUDA asynchronous malloc support" OFF) + mark_as_advanced(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) + if(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) + message(STATUS "KOKKOS: CUDA malloc async support enabled") + else() + message(STATUS "KOKKOS: CUDA malloc async support disabled") + endif() +endif() +if(Kokkos_ENABLE_HIP) + option(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY "Enable unified memory with HIP" ON) + mark_as_advanced(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) + option(KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS "Enable multiple kernel instantiations with HIP" ON) + mark_as_advanced(KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS) + option(KOKKOS_ENABLE_ROCTHRUST "Use RoCThrust library" ON) + mark_as_advanced(KOKKOS_ENABLE_ROCTHRUST) endif() # Adding OpenMP compiler flags without the checks done for # BUILD_OMP can result in compile failures. Enforce consistency. 
From 97627bd77ab4adaec43e714c2a341e90c9b25204 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 13 Sep 2024 05:34:15 -0400 Subject: [PATCH 08/15] fix indexing error --- src/KOKKOS/pair_reaxff_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/pair_reaxff_kokkos.cpp b/src/KOKKOS/pair_reaxff_kokkos.cpp index 7af5889e628..741d7f846ed 100644 --- a/src/KOKKOS/pair_reaxff_kokkos.cpp +++ b/src/KOKKOS/pair_reaxff_kokkos.cpp @@ -578,7 +578,7 @@ void PairReaxFFKokkos::Deallocate_Lookup_Tables() for (i = 0; i <= ntypes; ++i) { if (map[i] == -1) continue; for (j = i; j <= ntypes; ++j) { - if (map[i] == -1) continue; + if (map[j] == -1) continue; if (LR[i][j].n) { sfree(LR[i][j].y); sfree(LR[i][j].H); From 3079d51eaf06fd979801c2470a9ed0498eb62aec Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 13 Sep 2024 05:34:40 -0400 Subject: [PATCH 09/15] enforce that Pair::map is always initialized --- src/REAXFF/pair_reaxff.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/REAXFF/pair_reaxff.cpp b/src/REAXFF/pair_reaxff.cpp index b9f4f6c8385..08e90933b2b 100644 --- a/src/REAXFF/pair_reaxff.cpp +++ b/src/REAXFF/pair_reaxff.cpp @@ -174,6 +174,7 @@ void PairReaxFF::allocate() memory->create(cutsq,n+1,n+1,"pair:cutsq"); memory->create(cutghost,n+1,n+1,"pair:cutghost"); map = new int[n+1]; + for (int i = 0; i <= n; ++i) map[i] = -1; chi = new double[n+1]; eta = new double[n+1]; From 487f7ade68fa09a0ffe288e3bd6f0668e765f061 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 13 Sep 2024 12:14:49 -0600 Subject: [PATCH 10/15] Update Kokkos library in LAMMPS to v4.4.1 --- lib/kokkos/ | 15 ++ lib/kokkos/CMakeLists.txt | 2 +- lib/kokkos/Makefile.kokkos | 2 +- lib/kokkos/cmake/ | 1 + lib/kokkos/cmake/kokkos_enable_options.cmake | 4 +- .../unit_tests/TestWithoutInitializing.hpp | 12 ++ lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 39 ++++- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp | 23 ++- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 
20 +++ lib/kokkos/core/src/Kokkos_View.hpp | 2 + lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp | 2 +- .../src/OpenMP/Kokkos_OpenMP_Instance.cpp | 39 +++-- .../src/OpenMP/Kokkos_OpenMP_Instance.hpp | 4 +- .../src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp | 3 +- lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 45 ++++++ lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp | 25 ++- .../core/src/impl/Kokkos_ViewMapping.hpp | 10 +- lib/kokkos/core/unit_test/TestViewOfViews.hpp | 116 ++++++++++---- .../core/unit_test/cuda/TestCuda_Spaces.cpp | 16 ++ lib/kokkos/master_history.txt | 1 + lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 12 +- lib/kokkos/simd/unit_tests/TestSIMD.cpp | 1 + .../include/TestSIMD_Construction.hpp | 150 ++++++++++++++++++ 23 files changed, 467 insertions(+), 77 deletions(-) create mode 100644 lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp diff --git a/lib/kokkos/ b/lib/kokkos/ index 78225f9e6c2..7b1d69e5663 100644 --- a/lib/kokkos/ +++ b/lib/kokkos/ @@ -1,5 +1,20 @@ # CHANGELOG +## [4.4.01]( +[Full Changelog]( + +### Features: +* Introduce new SequentialHostInit view allocation property [\#7229]( + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental support for unified memory mode (intended for Grace-Hopper etc.) 
[\#6823]( + +### Bug Fixes +* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284]( +* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296]( + ## [4.4.00]( [Full Changelog]( diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 054de2c1dae..736cbac218c 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -151,7 +151,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 0) +set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index a8e1e803f45..eb059d9b81f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 4 KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 0 +KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial diff --git a/lib/kokkos/cmake/ b/lib/kokkos/cmake/ index 7997aa3707c..a93007ff83f 100644 --- a/lib/kokkos/cmake/ +++ b/lib/kokkos/cmake/ @@ -37,6 +37,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS #cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index b900c4a232e..53764b0c684 
100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options) ENDIF() ENDFUNCTION() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) diff --git a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index 7201cd402a9..e8558628dc8 100644 --- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -37,6 +37,17 @@ #endif ///@} +/// Some tests are skipped for unified memory space +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since 
unified memory requires additional " \ + "fences"; +#else +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE +#endif + TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); @@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 75318aff778..6ae24022c8f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -31,7 +31,6 @@ #include #include -//#include #include #include @@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id, cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! +#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + // This is intended for Grace-Hopper (and future unified memory architectures) + // The idea is to use host allocator and then advise to keep it in HBM on the + // device, but that requires CUDA 12.2 + static_assert(CUDART_VERSION >= 12020, + "CUDA runtime version >=12.2 required when " + "Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. " + "Please update your CUDA runtime version or " + "reconfigure with " + "-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF"); + if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr + error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); + if (error_code == cudaSuccess) { + // One would think cudaMemLocation{device_id, + // cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of + // members doesn't seem to be defined. 
+ cudaMemLocation loc; + = device_id; + loc.type = cudaMemLocationTypeDevice; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2( + ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc)); + } + } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) if (arg_alloc_size >= memory_threshold_g) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); @@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id, "Kokkos::Cuda: backend fence after async malloc"); } } - } else + } else { + error_code = cudaMalloc(&ptr, arg_alloc_size); + } +#else + error_code = cudaMalloc(&ptr, arg_alloc_size); #endif - { error_code = cudaMalloc(&ptr, arg_alloc_size); } + if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an @@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate( } #ifndef CUDART_VERSION #error CUDART_VERSION undefined! +#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( @@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::CudaSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::CudaUVMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index 0e20193e8b4..e1d062d72d5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp 
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -88,6 +88,19 @@ class CudaSpace { void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#endif + /**\brief Deallocate untracked memory in the cuda space */ void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; void deallocate(const char* arg_label, void* const arg_alloc_ptr, @@ -337,7 +350,11 @@ static_assert( template <> struct MemorySpaceAccess { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; @@ -558,8 +575,12 @@ struct DeepCopy #include +namespace { +int g_openmp_hardware_max_threads = 1; +} + namespace Kokkos { namespace Impl { std::vector OpenMPInternal::all_instances; std::mutex OpenMPInternal::all_instances_mutex; +int OpenMPInternal::max_hardware_threads() noexcept { + return g_openmp_hardware_max_threads; +} + void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * @@ -188,9 +196,9 @@ void OpenMPInternal::initialize(int thread_count) { // Before any other call to OMP query the maximum number of threads // and save the value for re-initialization unit testing. 
- Impl::g_openmp_hardware_max_threads = get_current_max_threads(); + g_openmp_hardware_max_threads = get_current_max_threads(); - int process_num_threads = Impl::g_openmp_hardware_max_threads; + int process_num_threads = g_openmp_hardware_max_threads; if (Kokkos::hwloc::available()) { process_num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -203,11 +211,11 @@ void OpenMPInternal::initialize(int thread_count) { // process_num_threads if thread_count > 0, set // g_openmp_hardware_max_threads to thread_count if (thread_count < 0) { - thread_count = Impl::g_openmp_hardware_max_threads; + thread_count = g_openmp_hardware_max_threads; } else if (thread_count == 0) { - if (Impl::g_openmp_hardware_max_threads != process_num_threads) { - Impl::g_openmp_hardware_max_threads = process_num_threads; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + if (g_openmp_hardware_max_threads != process_num_threads) { + g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(g_openmp_hardware_max_threads); } } else { if (Kokkos::show_warnings() && thread_count > process_num_threads) { @@ -218,16 +226,16 @@ void OpenMPInternal::initialize(int thread_count) { << ", requested thread : " << std::setw(3) << thread_count << std::endl; } - Impl::g_openmp_hardware_max_threads = thread_count; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(g_openmp_hardware_max_threads); } // setup thread local -#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) +#pragma omp parallel num_threads(g_openmp_hardware_max_threads) { Impl::SharedAllocationRecord::tracking_enable(); } auto &instance = OpenMPInternal::singleton(); - instance.m_pool_size = Impl::g_openmp_hardware_max_threads; + instance.m_pool_size = g_openmp_hardware_max_threads; // New, unified host thread team data: { @@ -272,10 +280,9 @@ void OpenMPInternal::finalize() { if (this == &singleton()) { auto const 
&instance = singleton(); // Silence Cuda Warning - const int nthreads = - instance.m_pool_size <= Impl::g_openmp_hardware_max_threads - ? Impl::g_openmp_hardware_max_threads - : instance.m_pool_size; + const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads + ? g_openmp_hardware_max_threads + : instance.m_pool_size; (void)nthreads; #pragma omp parallel num_threads(nthreads) @@ -284,7 +291,7 @@ void OpenMPInternal::finalize() { // allow main thread to track Impl::SharedAllocationRecord::tracking_enable(); - Impl::g_openmp_hardware_max_threads = 1; + g_openmp_hardware_max_threads = 1; } m_initialized = false; @@ -307,7 +314,7 @@ void OpenMPInternal::print_configuration(std::ostream &s) const { if (m_initialized) { const int numa_count = 1; - const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int core_per_numa = g_openmp_hardware_max_threads; const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index f4a0d3e2012..2aed723b18f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -47,8 +47,6 @@ namespace Impl { class OpenMPInternal; -inline int g_openmp_hardware_max_threads = 1; - struct OpenMPTraits { static constexpr int MAX_THREAD_COUNT = 512; }; @@ -86,6 +84,8 @@ class OpenMPInternal { void clear_thread_data(); + static int max_hardware_threads() noexcept; + int thread_pool_size() const { return m_pool_size; } void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp index a37e1758a26..5937c093ba1 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp @@ -105,7 
+105,8 @@ class UniqueToken { /// \brief upper bound for acquired values, i.e. 0 <= value < size() KOKKOS_INLINE_FUNCTION int size() const noexcept { - KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;)) + KOKKOS_IF_ON_HOST( + (return Kokkos::Impl::OpenMPInternal::max_hardware_threads();)) KOKKOS_IF_ON_DEVICE((return 0;)) } diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 95cb6f619cc..1ade75692f1 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -313,6 +313,51 @@ struct ViewValueFunctor { void destroy_shared_allocation() {} }; + +template +struct ViewValueFunctorSequentialHostInit { + using ExecSpace = typename DeviceType::execution_space; + using MemSpace = typename DeviceType::memory_space; + static_assert(SpaceAccessibility::accessible); + + ValueType* ptr; + size_t n; + + ViewValueFunctorSequentialHostInit() = default; + + ViewValueFunctorSequentialHostInit(ExecSpace const& /*arg_space*/, + ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + ViewValueFunctorSequentialHostInit(ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + void construct_shared_allocation() { + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + std::memset(static_cast(ptr), 0, n * sizeof(ValueType)); + } else { + for (size_t i = 0; i < n; ++i) { + new (ptr + i) ValueType(); + } + } + } + + void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { + for (size_t i = 0; i < n; ++i) { + (ptr + i)->~ValueType(); + } + } + } +}; + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp index 
e1b8ba86a5b..379180ae643 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp @@ -23,12 +23,16 @@ namespace Kokkos { namespace Impl { +struct SequentialHostInit_t {}; struct WithoutInitializing_t {}; struct AllowPadding_t {}; template struct is_view_ctor_property : public std::false_type {}; +template <> +struct is_view_ctor_property : public std::true_type {}; + template <> struct is_view_ctor_property : public std::true_type {}; @@ -84,10 +88,10 @@ struct ViewCtorProp> { /* Property flags have constexpr value */ template -struct ViewCtorProp< - std::enable_if_t::value || - std::is_same::value>, - P> { +struct ViewCtorProp || + std::is_same_v || + std::is_same_v>, + P> { ViewCtorProp() = default; ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; @@ -199,6 +203,11 @@ struct ViewCtorProp : public ViewCtorProp... { Kokkos::Impl::has_type::value; static constexpr bool initialize = !Kokkos::Impl::has_type::value; + static constexpr bool sequential_host_init = + Kokkos::Impl::has_type::value; + static_assert(initialize || !sequential_host_init, + "Incompatible WithoutInitializing and SequentialHostInit view " + "alloc properties"); using memory_space = typename var_memory_space::type; using execution_space = typename var_execution_space::type; @@ -251,7 +260,9 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, (is_view_label::value && !ViewCtorProp::has_label) || (std::is_same_v && - ViewCtorProp::initialize)) { + ViewCtorProp::initialize) || + (std::is_same_v && + !ViewCtorProp::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast &>(new_view_ctor_prop).value = @@ -299,7 +310,9 @@ struct WithPropertiesIfUnset, Property, Properties...> { (is_view_label::value && !ViewCtorProp::has_label) || (std::is_same_v && - ViewCtorProp::initialize)) { + ViewCtorProp::initialize) || + 
(std::is_same_v && + !ViewCtorProp::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast &>(new_view_ctor_prop).value = diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 8919dccdb7a..10aaa63b7c8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -2825,10 +2825,12 @@ class ViewMapping< using memory_space = typename Traits::memory_space; static_assert( SpaceAccessibility::accessible); - using value_type = typename Traits::value_type; - using functor_type = - ViewValueFunctor, - value_type>; + using device_type = Kokkos::Device; + using value_type = typename Traits::value_type; + using functor_type = std::conditional_t< + alloc_prop::sequential_host_init, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; using record_type = Kokkos::Impl::SharedAllocationRecord; diff --git a/lib/kokkos/core/unit_test/TestViewOfViews.hpp b/lib/kokkos/core/unit_test/TestViewOfViews.hpp index a87c829bb73..1d53bca336d 100644 --- a/lib/kokkos/core/unit_test/TestViewOfViews.hpp +++ b/lib/kokkos/core/unit_test/TestViewOfViews.hpp @@ -20,7 +20,7 @@ namespace { -// User-defined type with a View data member +// User-defined types with a View data member template class S { V v_; @@ -28,48 +28,102 @@ class S { public: template S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} - S() = default; + KOKKOS_DEFAULTED_FUNCTION S() = default; }; template -void test_view_of_views() { +class N { // not default constructible + V v_; + + public: + template + N(std::string label, Extents... extents) : v_(std::move(label), extents...) {} +}; + +template +class H { // constructible and destructible only from on the host side + V v_; + + public: + template + H(std::string label, Extents... extents) : v_(std::move(label), extents...) 
{} + H() {} + ~H() {} +}; + +template +void test_view_of_views_default() { + // assigning a default-constructed view to destruct the inner objects using VoV = Kokkos::View; - { // assigning a default-constructed view to destruct the inner objects - VoV vov("vov", 2, 3); - V a("a"); - V b("b"); - vov(0, 0) = a; - vov(1, 0) = a; - vov(0, 1) = b; + VoV vov("vov", 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; #ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - vov(0, 0) = V(); - vov(1, 0) = V(); - vov(0, 1) = V(); + vov(0, 0) = V(); + vov(1, 0) = V(); + vov(0, 1) = V(); #endif - } - { // using placement new to construct the inner objects and explicitly - // calling the destructor - VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); - V a("a"); - V b("b"); - new (&vov(0, 0)) V(a); - new (&vov(1, 0)) V(a); - new (&vov(0, 1)) V(b); +} + +template +void test_view_of_views_without_initializing() { + // using placement new to construct the inner objects and explicitly + // calling the destructor + using VoV = Kokkos::View; + VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); + V a("a"); + V b("b"); + new (&vov(0, 0)) V(a); + new (&vov(1, 0)) V(a); + new (&vov(0, 1)) V(b); #ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - vov(0, 0).~V(); - vov(1, 0).~V(); - vov(0, 1).~V(); + vov(0, 0).~V(); + vov(1, 0).~V(); + vov(0, 1).~V(); #else - // leaks memory + // leaks memory #endif - } } -TEST(TEST_CATEGORY, view_of_views) { - test_view_of_views>(); - test_view_of_views>(); +template +void test_view_of_views_sequential_host_init() { + // inner views value-initialized sequentially on the host, and also + // sequentially destructed on the host, without the need to cleanup + using VoV = Kokkos::View; + VoV vov(Kokkos::view_alloc("vov", Kokkos::SequentialHostInit), 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) 
= b; +} + +TEST(TEST_CATEGORY, view_of_views_default) { + test_view_of_views_default>(); + test_view_of_views_default>(); // User-defined type with View data member - test_view_of_views>>(); + test_view_of_views_default>>(); +} + +TEST(TEST_CATEGORY, view_of_views_without_initializing) { + test_view_of_views_without_initializing>(); + test_view_of_views_without_initializing< + S>>(); + test_view_of_views_without_initializing< + N>>(); + test_view_of_views_without_initializing< + H>>(); +} + +TEST(TEST_CATEGORY, test_view_of_views_sequential_host_init) { + test_view_of_views_sequential_host_init>(); + test_view_of_views_sequential_host_init< + S>>(); + test_view_of_views_sequential_host_init< + H>>(); } } // namespace diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index 11fe6b8555b..f40af99e7c2 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -39,9 +39,14 @@ TEST(cuda, space_access) { !Kokkos::Impl::MemorySpaceAccess::assignable); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#else + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); +#endif static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(!Kokkos::SpaceAccessibility::accessible); +#else + static_assert(Kokkos::SpaceAccessibility::accessible); +#endif static_assert(Kokkos::SpaceAccessibility::accessible); @@ -157,8 +167,14 @@ TEST(cuda, space_access) { Kokkos::SpaceAccessibility::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(std::is_same::Space, Kokkos::HostSpace>::value); +#else + static_assert(std::is_same::Space, + Kokkos::Device>::value); +#endif static_assert( std::is_same::Space, diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index a0e83bef237..f2a41636101 100644 
--- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -38,3 +38,4 @@ tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c +tag: 4.4.01 date: 09:12:2024 master: 08ceff92 release: 2d60c039 diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 27c8af79abd..0525dc8887a 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -361,9 +361,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm_set1_epi32(-std::int32_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -460,9 +458,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<8>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm256_set1_epi32(-std::int32_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -561,9 +557,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() 
= default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm256_set1_epi64x(-std::int64_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { diff --git a/lib/kokkos/simd/unit_tests/TestSIMD.cpp b/lib/kokkos/simd/unit_tests/TestSIMD.cpp index 7a1f9be2a0f..df18b43c4e3 100644 --- a/lib/kokkos/simd/unit_tests/TestSIMD.cpp +++ b/lib/kokkos/simd/unit_tests/TestSIMD.cpp @@ -22,3 +22,4 @@ #include #include #include +#include diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp new file mode 100644 index 00000000000..0ceb1496c47 --- /dev/null +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_CONSTRUCTION_HPP +#define KOKKOS_TEST_SIMD_CONSTRUCTION_HPP + +#include +#include + +template +inline void host_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd; + + static_assert(std::is_nothrow_default_constructible_v); + static_assert(std::is_nothrow_copy_assignable_v); + static_assert(std::is_nothrow_copy_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_move_constructible_v); + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + EXPECT_TRUE(all_of(test_simd == result)); +} + +template +inline void host_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask; + + static_assert(std::is_nothrow_default_constructible_v); + static_assert(std::is_nothrow_copy_assignable_v); + static_assert(std::is_nothrow_copy_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_move_constructible_v); + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + EXPECT_EQ(test_mask, result); +} + +template +inline void host_check_construction() { + if constexpr (is_type_v>) { + host_test_simd_traits(); + host_test_mask_traits(); + } +} + +template +inline void host_check_construction_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_construction(), ...); +} + +template +inline void host_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + 
(host_check_construction_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd; + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + + kokkos_checker checker; + checker.truth(all_of(test_simd == result)); +} + +template +KOKKOS_INLINE_FUNCTION void device_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask; + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + + kokkos_checker checker; + checker.truth(test_mask == result); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction() { + if constexpr (is_type_v>) { + device_test_simd_traits(); + device_test_mask_traits(); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_construction(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_construction_all_types(DataTypes()), ...); +} + +class simd_device_construction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_construction_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_construction) { + host_check_construction_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_construction) { + Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), + simd_device_construction_functor()); +} + +#endif From 
5075c7cfa135eeb401e2b35ab35385b160e476ff Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 13 Sep 2024 12:16:58 -0600 Subject: [PATCH 11/15] Update CMake --- cmake/Modules/Packages/KOKKOS.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 08b109eff4a..691be4cd675 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -69,8 +69,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "95af2e2d4b10a67a63cce09715fba127" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "de6ee80d00b6212b02bfb7f1e71a8392" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) GetFallbackURL(KOKKOS_URL KOKKOS_FALLBACK) @@ -95,7 +95,7 @@ if(DOWNLOAD_KOKKOS) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 4.4.00 REQUIRED CONFIG) + find_package(Kokkos 4.4.01 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) else() set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) From 2429c89eae2d6f680c3b454911f4da9e76f1fadd Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Sep 2024 14:59:51 -0600 Subject: [PATCH 12/15] Fix deadlock by always deallocating views of views in serial --- src/KOKKOS/pair_reaxff_kokkos.cpp | 15 ++++++++++++--- src/KOKKOS/pair_reaxff_kokkos.h | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/KOKKOS/pair_reaxff_kokkos.cpp b/src/KOKKOS/pair_reaxff_kokkos.cpp index 741d7f846ed..d3c42146082 100644 --- 
a/src/KOKKOS/pair_reaxff_kokkos.cpp +++ b/src/KOKKOS/pair_reaxff_kokkos.cpp @@ -105,7 +105,16 @@ PairReaxFFKokkos::~PairReaxFFKokkos() memoryKK->destroy_kokkos(k_tmpbo,tmpbo); tmpbo = nullptr; - // deallocate views of views in serial to prevent race condition in profiling tools + deallocate_views_of_views(); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairReaxFFKokkos::deallocate_views_of_views() +{ + + // deallocate views of views in serial to prevent race conditions for (int i = 0; i < (int)k_LR.extent(0); i++) { for (int j = 0; j < (int)k_LR.extent(1); j++) { @@ -409,8 +418,8 @@ void PairReaxFFKokkos::init_md() int ntypes = atom->ntypes; Init_Lookup_Tables(); + deallocate_views_of_views(); k_LR = tdual_LR_lookup_table_kk_2d("lookup:LR",ntypes+1,ntypes+1); - d_LR = k_LR.template view(); for (int i = 1; i <= ntypes; ++i) { if (map[i] == -1) continue; @@ -1392,7 +1401,7 @@ void PairReaxFFKokkos::operator()(TagPairReaxComputeTabulatedLJCoulo const int tmin = MIN(itype, jtype); const int tmax = MAX(itype, jtype); - const LR_lookup_table_kk& t = d_LR(tmin,tmax); + const LR_lookup_table_kk& t = k_LR.template view()(tmin,tmax); /* Cubic Spline Interpolation */ diff --git a/src/KOKKOS/pair_reaxff_kokkos.h b/src/KOKKOS/pair_reaxff_kokkos.h index 0fe47fcba8e..4c7127c17b1 100644 --- a/src/KOKKOS/pair_reaxff_kokkos.h +++ b/src/KOKKOS/pair_reaxff_kokkos.h @@ -384,6 +384,7 @@ class PairReaxFFKokkos : public PairReaxFF { F_FLOAT *fi, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *dril, F_FLOAT *drjl, F_FLOAT *drkl) const; protected: + void deallocate_views_of_views(); void allocate(); void allocate_array(); void setup(); @@ -497,7 +498,6 @@ class PairReaxFFKokkos : public PairReaxFF { typedef typename tdual_LR_lookup_table_kk_2d::t_dev t_LR_lookup_table_kk_2d; tdual_LR_lookup_table_kk_2d k_LR; - t_LR_lookup_table_kk_2d d_LR; DAT::tdual_int_2d k_tmpid; DAT::tdual_ffloat_2d k_tmpbo; From 2e05cfeea915891b245018a429dc456160da3ab8 Mon Sep 
17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Sep 2024 15:12:30 -0600 Subject: [PATCH 13/15] Small code cleanup --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 20 +++++++++---------- src/KOKKOS/pair_pace_extrapolation_kokkos.h | 2 ++ src/KOKKOS/pair_pace_kokkos.cpp | 20 +++++++++---------- src/KOKKOS/pair_pace_kokkos.h | 2 ++ 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index e7d376c8707..58ba8d6c1a0 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -84,7 +84,15 @@ PairPACEExtrapolationKokkos::~PairPACEExtrapolationKokkos() memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_vatom,vatom); - // deallocate views of views in serial to prevent issues in Kokkos tools + deallocate_views_of_views(); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::deallocate_views_of_views() +{ + // deallocate views of views in serial to prevent race conditions if ( { for (int i = 0; i < nelements; i++) { @@ -244,15 +252,7 @@ void PairPACEExtrapolationKokkos::copy_splines() { auto basis_set = aceimpl->basis_set; - if ( { - for (int i = 0; i < nelements; i++) { - for (int j = 0; j < nelements; j++) { - k_splines_gk.h_view(i, j).deallocate(); - k_splines_rnl.h_view(i, j).deallocate(); - k_splines_hc.h_view(i, j).deallocate(); - } - } - } + deallocate_views_of_views(); k_splines_gk = Kokkos::DualView("pace:splines_gk", nelements, nelements); k_splines_rnl = Kokkos::DualView("pace:splines_rnl", nelements, nelements); diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h index df8a0c1740c..c1c1debd45b 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -296,6 +296,8 @@ class PairPACEExtrapolationKokkos : public 
PairPACEExtrapolation { t_ace_3d3 f_ij; + void deallocate_views_of_views(); + public: struct SplineInterpolatorKokkos { int ntot, nlut, num_of_functions; diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp index 4407d1231ef..8d05c26239e 100644 --- a/src/KOKKOS/pair_pace_kokkos.cpp +++ b/src/KOKKOS/pair_pace_kokkos.cpp @@ -84,7 +84,15 @@ PairPACEKokkos::~PairPACEKokkos() memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_vatom,vatom); - // deallocate views of views in serial to prevent issues in Kokkos tools + deallocate_views_of_views(); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::deallocate_views_of_views() +{ + // deallocate views of views in serial to prevent race conditions if ( { for (int i = 0; i < nelements; i++) { @@ -240,15 +248,7 @@ void PairPACEKokkos::copy_splines() { auto basis_set = aceimpl->basis_set; - if ( { - for (int i = 0; i < nelements; i++) { - for (int j = 0; j < nelements; j++) { - k_splines_gk.h_view(i, j).deallocate(); - k_splines_rnl.h_view(i, j).deallocate(); - k_splines_hc.h_view(i, j).deallocate(); - } - } - } + deallocate_views_of_views(); k_splines_gk = Kokkos::DualView("pace:splines_gk", nelements, nelements); k_splines_rnl = Kokkos::DualView("pace:splines_rnl", nelements, nelements); diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h index e22c61f0ea5..6b43e52614e 100644 --- a/src/KOKKOS/pair_pace_kokkos.h +++ b/src/KOKKOS/pair_pace_kokkos.h @@ -283,6 +283,8 @@ class PairPACEKokkos : public PairPACE { t_ace_3d3 f_ij; + void deallocate_views_of_views(); + public: struct SplineInterpolatorKokkos { int ntot, nlut, num_of_functions; From e95de835c05cf3d43d78677b57b8e759f0e6d8a4 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Sep 2024 15:38:29 -0600 Subject: [PATCH 14/15] Tweak build defaults --- cmake/Modules/Packages/KOKKOS.cmake | 21 +++++++++++---------- lib/kokkos/Makefile.kokkos | 2 
+- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 691be4cd675..adb3abab6bf 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -8,8 +8,6 @@ endif() ######################################################################## # consistency checks and Kokkos options/settings required by LAMMPS if(Kokkos_ENABLE_CUDA) - message(STATUS "KOKKOS: Enabling CUDA LAMBDA function support") - set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE) option(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC "CUDA asynchronous malloc support" OFF) mark_as_advanced(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) if(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) @@ -19,12 +17,15 @@ if(Kokkos_ENABLE_CUDA) endif() endif() if(Kokkos_ENABLE_HIP) - option(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY "Enable unified memory with HIP" ON) - mark_as_advanced(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) - option(KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS "Enable multiple kernel instantiations with HIP" ON) - mark_as_advanced(KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS) - option(KOKKOS_ENABLE_ROCTHRUST "Use RoCThrust library" ON) - mark_as_advanced(KOKKOS_ENABLE_ROCTHRUST) + option(Kokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS "Enable multiple kernel instantiations with HIP" ON) + mark_as_advanced(Kokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS) + option(Kokkos_ENABLE_ROCTHRUST "Use RoCThrust library" ON) + mark_as_advanced(Kokkos_ENABLE_ROCTHRUST) + + if(Kokkos_ARCH_AMD_GFX942 OR Kokkos_ARCH_AMD_GFX940) + option(Kokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY "Enable unified memory with HIP" ON) + mark_as_advanced(Kokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY) + endif() endif() # Adding OpenMP compiler flags without the checks done for # BUILD_OMP can result in compile failures. Enforce consistency. 
@@ -38,8 +39,8 @@ if(Kokkos_ENABLE_SERIAL) if(NOT (Kokkos_ENABLE_OPENMP OR Kokkos_ENABLE_THREADS OR Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP OR Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET)) - message(STATUS "KOKKOS: Disabling atomics for Serial Backend") - set(Kokkos_ENABLE_ATOMICS_BYPASS ON CACHE BOOL "" FORCE) + option(Kokkos_ENABLE_ATOMICS_BYPASS "Disable atomics for Kokkos Serial Backend" ON) + mark_as_advanced(Kokkos_ENABLE_ATOMICS_BYPASS) endif() endif() ######################################################################## diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index eb059d9b81f..eb95c5448d6 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -41,7 +41,7 @@ KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. # Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async -KOKKOS_CUDA_OPTIONS ?= "enable_lambda,disable_malloc_async" +KOKKOS_CUDA_OPTIONS ?= "disable_malloc_async" # Options: rdc KOKKOS_HIP_OPTIONS ?= "" From 7e31a4f482014781fa4eb41e92205e151e2ec00e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Sep 2024 16:14:06 -0600 Subject: [PATCH 15/15] Simplify view deallocation code --- src/KOKKOS/fix_acks2_reaxff_kokkos.cpp | 4 +- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 2 +- src/KOKKOS/fix_shake_kokkos.cpp | 4 +- src/KOKKOS/meam_dens_init_kokkos.h | 26 +++++------ src/KOKKOS/meam_force_kokkos.h | 6 +-- src/KOKKOS/pair_adp_kokkos.cpp | 12 ++--- src/KOKKOS/pair_dpd_ext_kokkos.cpp | 6 +-- src/KOKKOS/pair_dpd_ext_tstat_kokkos.cpp | 4 +- src/KOKKOS/pair_dpd_kokkos.cpp | 6 +-- src/KOKKOS/pair_dpd_tstat_kokkos.cpp | 4 +- src/KOKKOS/pair_eam_alloy_kokkos.cpp | 8 ++-- src/KOKKOS/pair_eam_fs_kokkos.cpp | 8 ++-- src/KOKKOS/pair_eam_kokkos.cpp | 8 ++-- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 4 +- src/KOKKOS/pair_pace_kokkos.cpp | 4 +- src/KOKKOS/pair_reaxff_kokkos.cpp | 44 +++++++++---------- src/KOKKOS/pair_snap_kokkos_impl.h | 4 +- 
src/KOKKOS/pair_sw_kokkos.cpp | 6 +-- src/KOKKOS/pair_tersoff_kokkos.cpp | 6 +-- src/KOKKOS/pair_tersoff_mod_kokkos.cpp | 6 +-- src/KOKKOS/pair_tersoff_zbl_kokkos.cpp | 6 +-- 21 files changed, 89 insertions(+), 89 deletions(-) diff --git a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp index c0b263d7364..bfcb66e5252 100644 --- a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp @@ -365,7 +365,7 @@ void FixACKS2ReaxFFKokkos::pre_force(int /*vflag*/) // free duplicated memory - dup_X_diag = decltype(dup_X_diag)(); + dup_X_diag = {}; } if (neighflag != FULL) { @@ -1419,7 +1419,7 @@ void FixACKS2ReaxFFKokkos::sparse_matvec_acks2(typename AT::t_ffloat // free duplicated memory - dup_bb = decltype(dup_bb)(); + dup_bb = {}; } } diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index f93f6cb70ed..7ef4505b068 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -291,7 +291,7 @@ void FixQEqReaxFFKokkos::pre_force(int /*vflag*/) // free duplicated memory if (need_dup) - dup_o = decltype(dup_o)(); + dup_o = {}; atomKK->modified(execution_space,datamask_modify); diff --git a/src/KOKKOS/fix_shake_kokkos.cpp b/src/KOKKOS/fix_shake_kokkos.cpp index 47f932d8f2d..52826d7b04e 100644 --- a/src/KOKKOS/fix_shake_kokkos.cpp +++ b/src/KOKKOS/fix_shake_kokkos.cpp @@ -459,8 +459,8 @@ void FixShakeKokkos::post_force(int vflag) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/meam_dens_init_kokkos.h b/src/KOKKOS/meam_dens_init_kokkos.h index 68e69430fd9..dd63be96bdb 100644 --- a/src/KOKKOS/meam_dens_init_kokkos.h +++ b/src/KOKKOS/meam_dens_init_kokkos.h @@ -294,20 +294,20 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ Kokkos::Experimental::contribute(d_arho3mb, dup_arho3mb); // free duplicated memory - 
dup_rho0 = decltype(dup_rho0)(); - dup_arho2b = decltype(dup_arho2b)(); - dup_arho1 = decltype(dup_arho1)(); - dup_arho2 = decltype(dup_arho2)(); - dup_arho3 = decltype(dup_arho3)(); - dup_arho3b = decltype(dup_arho3b)(); - dup_t_ave = decltype(dup_t_ave)(); - dup_tsq_ave = decltype(dup_tsq_ave)(); + dup_rho0 = {}; + dup_arho2b = {}; + dup_arho1 = {}; + dup_arho2 = {}; + dup_arho3 = {}; + dup_arho3b = {}; + dup_t_ave = {}; + dup_tsq_ave = {}; // msmeam - dup_arho2mb = decltype(dup_arho2mb)(); - dup_arho1m = decltype(dup_arho1m)(); - dup_arho2m = decltype(dup_arho2m)(); - dup_arho3m = decltype(dup_arho3m)(); - dup_arho3mb = decltype(dup_arho3mb)(); + dup_arho2mb = {}; + dup_arho1m = {}; + dup_arho2m = {}; + dup_arho3m = {}; + dup_arho3mb = {}; } } diff --git a/src/KOKKOS/meam_force_kokkos.h b/src/KOKKOS/meam_force_kokkos.h index a546ab54d41..1875e22dcfe 100644 --- a/src/KOKKOS/meam_force_kokkos.h +++ b/src/KOKKOS/meam_force_kokkos.h @@ -75,9 +75,9 @@ void MEAMKokkos::meam_force( if (vflag_atom) Kokkos::Experimental::contribute(d_vatom, dup_vatom); // free duplicated memory - dup_f = decltype(dup_f)(); - if (eflag_atom) dup_eatom = decltype(dup_eatom)(); - if (vflag_atom) dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + if (eflag_atom) dup_eatom = {}; + if (vflag_atom) dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_adp_kokkos.cpp b/src/KOKKOS/pair_adp_kokkos.cpp index 1297d626519..999a67ca49c 100644 --- a/src/KOKKOS/pair_adp_kokkos.cpp +++ b/src/KOKKOS/pair_adp_kokkos.cpp @@ -297,12 +297,12 @@ void PairADPKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_rho = decltype(dup_rho)(); - dup_mu = decltype(dup_mu)(); - dup_lambda = decltype(dup_lambda)(); - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_rho = {}; + dup_mu = {}; + dup_lambda = {}; + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_dpd_ext_kokkos.cpp 
b/src/KOKKOS/pair_dpd_ext_kokkos.cpp index 636235d1c81..95c9d304f3f 100644 --- a/src/KOKKOS/pair_dpd_ext_kokkos.cpp +++ b/src/KOKKOS/pair_dpd_ext_kokkos.cpp @@ -207,9 +207,9 @@ void PairDPDExtKokkos::compute(int eflagin, int vflagin) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_dpd_ext_tstat_kokkos.cpp b/src/KOKKOS/pair_dpd_ext_tstat_kokkos.cpp index 213b344fbba..91d1183957d 100644 --- a/src/KOKKOS/pair_dpd_ext_tstat_kokkos.cpp +++ b/src/KOKKOS/pair_dpd_ext_tstat_kokkos.cpp @@ -212,8 +212,8 @@ void PairDPDExtTstatKokkos::compute(int eflagin, int vflagin) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_dpd_kokkos.cpp b/src/KOKKOS/pair_dpd_kokkos.cpp index f888b5f6ce1..0ebf8ccae06 100644 --- a/src/KOKKOS/pair_dpd_kokkos.cpp +++ b/src/KOKKOS/pair_dpd_kokkos.cpp @@ -207,9 +207,9 @@ void PairDPDKokkos::compute(int eflagin, int vflagin) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_dpd_tstat_kokkos.cpp b/src/KOKKOS/pair_dpd_tstat_kokkos.cpp index 63dbda3b59e..d51cce629a3 100644 --- a/src/KOKKOS/pair_dpd_tstat_kokkos.cpp +++ b/src/KOKKOS/pair_dpd_tstat_kokkos.cpp @@ -211,8 +211,8 @@ void PairDPDTstatKokkos::compute(int eflagin, int vflagin) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp index b02faced1e6..90a82616a67 100644 --- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp +++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp 
@@ -309,10 +309,10 @@ void PairEAMAlloyKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_rho = decltype(dup_rho)(); - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_rho = {}; + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp index 4da146e68e2..11719a89790 100644 --- a/src/KOKKOS/pair_eam_fs_kokkos.cpp +++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp @@ -309,10 +309,10 @@ void PairEAMFSKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_rho = decltype(dup_rho)(); - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_rho = {}; + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp index 54ffa84f2d3..1e870555dcd 100644 --- a/src/KOKKOS/pair_eam_kokkos.cpp +++ b/src/KOKKOS/pair_eam_kokkos.cpp @@ -304,10 +304,10 @@ void PairEAMKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_rho = decltype(dup_rho)(); - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_rho = {}; + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index 58ba8d6c1a0..746055f28c7 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -808,8 +808,8 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp index 8d05c26239e..0afbb7540e7 100644 --- 
a/src/KOKKOS/pair_pace_kokkos.cpp +++ b/src/KOKKOS/pair_pace_kokkos.cpp @@ -753,8 +753,8 @@ void PairPACEKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_reaxff_kokkos.cpp b/src/KOKKOS/pair_reaxff_kokkos.cpp index d3c42146082..b0a53a27fd8 100644 --- a/src/KOKKOS/pair_reaxff_kokkos.cpp +++ b/src/KOKKOS/pair_reaxff_kokkos.cpp @@ -118,10 +118,10 @@ void PairReaxFFKokkos::deallocate_views_of_views() for (int i = 0; i < (int)k_LR.extent(0); i++) { for (int j = 0; j < (int)k_LR.extent(1); j++) { - k_LR.h_view(i,j).d_vdW = decltype(k_LR.h_view(i,j).d_vdW )(); - k_LR.h_view(i,j).d_CEvd = decltype(k_LR.h_view(i,j).d_CEvd )(); - k_LR.h_view(i,j).d_ele = decltype(k_LR.h_view(i,j).d_ele )(); - k_LR.h_view(i,j).d_CEclmb = decltype(k_LR.h_view(i,j).d_CEclmb)(); + k_LR.h_view(i,j).d_vdW = {}; + k_LR.h_view(i,j).d_CEvd = {}; + k_LR.h_view(i,j).d_ele = {}; + k_LR.h_view(i,j).d_CEclmb = {}; } } } @@ -1101,19 +1101,19 @@ void PairReaxFFKokkos::compute(int eflag_in, int vflag_in) // free scatterview memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); - dup_dDeltap_self = decltype(dup_dDeltap_self)(); - dup_total_bo = decltype(dup_total_bo)(); - dup_CdDelta = decltype(dup_CdDelta)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; + dup_dDeltap_self = {}; + dup_total_bo = {}; + dup_CdDelta = {}; } else { - ndup_f = decltype(ndup_f)(); - ndup_eatom = decltype(ndup_eatom)(); - ndup_vatom = decltype(ndup_vatom)(); - ndup_dDeltap_self = decltype(ndup_dDeltap_self)(); - ndup_total_bo = decltype(ndup_total_bo)(); - ndup_CdDelta = decltype(ndup_CdDelta)(); + ndup_f = {}; + ndup_eatom = {}; + ndup_vatom = {}; + ndup_dDeltap_self = {}; + ndup_total_bo = {}; + ndup_CdDelta = {}; } d_neighbors = typename AT::t_neighbors_2d(); @@ -1501,13 +1501,13 @@ void 
PairReaxFFKokkos::allocate_array() { // free scatterview memory if (need_dup) { - dup_dDeltap_self = decltype(dup_dDeltap_self)(); - dup_total_bo = decltype(dup_total_bo)(); - dup_CdDelta = decltype(dup_CdDelta)(); + dup_dDeltap_self = {}; + dup_total_bo = {}; + dup_CdDelta = {}; } else { - ndup_dDeltap_self = decltype(ndup_dDeltap_self)(); - ndup_total_bo = decltype(ndup_total_bo)(); - ndup_CdDelta = decltype(ndup_CdDelta)(); + ndup_dDeltap_self = {}; + ndup_total_bo = {}; + ndup_CdDelta = {}; } if (cut_hbsq > 0.0) { diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 839240c62f3..97c7d17ea92 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -525,8 +525,8 @@ void PairSNAPKokkos::compute(int eflag_in, // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_sw_kokkos.cpp b/src/KOKKOS/pair_sw_kokkos.cpp index 01b856a7b5c..d62af5a78f9 100644 --- a/src/KOKKOS/pair_sw_kokkos.cpp +++ b/src/KOKKOS/pair_sw_kokkos.cpp @@ -186,9 +186,9 @@ void PairSWKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_tersoff_kokkos.cpp b/src/KOKKOS/pair_tersoff_kokkos.cpp index 1a0d45e4357..c2099f95b56 100644 --- a/src/KOKKOS/pair_tersoff_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_kokkos.cpp @@ -293,9 +293,9 @@ void PairTersoffKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp index 
b941755d4b5..3e651f14338 100644 --- a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp @@ -283,9 +283,9 @@ void PairTersoffMODKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp index 08d6cb17d7f..3d6d1ea27b2 100644 --- a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp @@ -296,9 +296,9 @@ void PairTersoffZBLKokkos::compute(int eflag_in, int vflag_in) // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_eatom = decltype(dup_eatom)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_eatom = {}; + dup_vatom = {}; } }