From 7ec5fda5243e599d83af841b5c38198a2f7f05fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Tue, 21 May 2024 11:31:18 +0000 Subject: [PATCH 01/54] Removed accidentally included #include "hip/amd_detail/host_defines.h" --- library/src/rng/mrg.hpp | 1 - library/src/rng/mtgp32.hpp | 1 - library/src/rng/xorwow.hpp | 1 - 3 files changed, 3 deletions(-) diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index a01ac23ba..357f06f4b 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -30,7 +30,6 @@ #include "generator_type.hpp" #include "system.hpp" -#include #include #include #include diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index fa81defbe..4ca55957a 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -63,7 +63,6 @@ #include "generator_type.hpp" #include "system.hpp" -#include #include #include #include diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 7b0d4ee99..9c34d5951 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -29,7 +29,6 @@ #include "generator_type.hpp" #include "system.hpp" -#include #include #include From 32922c83b130d4f1a330d8b0f7ce3fd7ffd1dde8 Mon Sep 17 00:00:00 2001 From: Gergely Meszaros Date: Fri, 5 Apr 2024 08:29:42 +0000 Subject: [PATCH 02/54] chore(gitignore): ignore python venvs --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 18fd6e97e..7ad4d67a5 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,4 @@ CMakeLists.txt.user # Python __pycache__ +.venv From 1f6c4d1fc1a13830b56e3412e9e2e0d6b7936c5c Mon Sep 17 00:00:00 2001 From: Gergely Meszaros Date: Fri, 5 Apr 2024 08:30:43 +0000 Subject: [PATCH 03/54] docs(api reference): rocm-docs-core headers and stylesheets in doxyfile rocm-docs-core distributes headers and stylesheets for doxygen for embedding its HTML output into sphinx. 
These mostly fix dark-theme and other minor visual issues when doxygen output is used this way. --- docs/doxygen/Doxyfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index ade283856..b0ae0c543 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -1131,7 +1131,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = ../_doxygen/header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1141,7 +1141,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = ../_doxygen/footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1153,7 +1153,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = ../_doxygen/stylesheet.css # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1166,7 +1166,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. 
Note From 789ce4fcd1fe54367aa175144ed6b10f2969a60b Mon Sep 17 00:00:00 2001 From: Nol Moonen Date: Fri, 12 Apr 2024 09:41:51 +0000 Subject: [PATCH 04/54] improve accuracy of poisson histogram test --- test/internal/test_poisson_distribution.cpp | 63 ++++++++++++++------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/test/internal/test_poisson_distribution.cpp b/test/internal/test_poisson_distribution.cpp index 968247bf4..2b3ce0207 100644 --- a/test/internal/test_poisson_distribution.cpp +++ b/test/internal/test_poisson_distribution.cpp @@ -86,9 +86,9 @@ TEST_P(poisson_distribution_tests, histogram_compare) { const double lambda = GetParam(); - std::random_device rd; - std::mt19937 gen(rd()); - std::poisson_distribution host_dis(lambda); + const unsigned int seed = std::random_device{}(); + SCOPED_TRACE(testing::Message() << "with seed = " << seed); + std::mt19937 gen(seed); poisson_distribution dis; dis.set_lambda(lambda); @@ -96,37 +96,60 @@ TEST_P(poisson_distribution_tests, histogram_compare) const size_t samples_count = static_cast(std::max(2.0, sqrt(lambda))) * 100000; const size_t bin_size = static_cast(std::max(2.0, sqrt(lambda))); const size_t bins_count = static_cast((2.0 * lambda + 10.0) / bin_size); - std::vector historgram0(bins_count); - std::vector historgram1(bins_count); + std::vector histogram_rocrand(bins_count); for(size_t si = 0; si < samples_count; si++) { - const unsigned int v = host_dis(gen); + const unsigned int v = dis(gen()); const size_t bin = v / bin_size; if(bin < bins_count) { - historgram0[bin]++; + histogram_rocrand[bin]++; } } - for(size_t si = 0; si < samples_count; si++) + dis.deallocate(); + + // for small lambda, histogram test is inaccurate due to relatively large bins + // for large lambda, expected value calculation is inaccurate due to non-finite terms + if(lambda <= 50.0) { - const unsigned int v = dis(gen()); - const size_t bin = v / bin_size; - if(bin < bins_count) + for(size_t bi = 0; bi < bins_count; 
bi++) { - historgram1[bin]++; + const unsigned int h = histogram_rocrand[bi]; + double tmp = 0.0; + for(size_t i = 0; i < bin_size; ++i) + { + const int k = bi * bin_size + i; + tmp += std::pow(lambda, k) * std::exp(-lambda) / std::tgamma(k + 1.0); + } + const unsigned int actual = std::roundl(samples_count * tmp); + + // Very loose comparison + EXPECT_NEAR(h, actual, std::max(samples_count * 1e-3, actual * 1e-1)); } } - - dis.deallocate(); - - // Very loose comparison - for(size_t bi = 0; bi < bins_count; bi++) + else { - const unsigned int h0 = historgram0[bi]; - const unsigned int h1 = historgram1[bi]; - EXPECT_NEAR(h0, h1, std::max(samples_count * 1e-3, std::max(h0, h1) * 1e-1)); + std::poisson_distribution host_dis(lambda); + std::vector histogram_stl(bins_count); + for(size_t si = 0; si < samples_count; si++) + { + const unsigned int v = host_dis(gen); + const size_t bin = v / bin_size; + if(bin < bins_count) + { + histogram_stl[bin]++; + } + } + + // Very loose comparison + for(size_t bi = 0; bi < bins_count; bi++) + { + const unsigned int h0 = histogram_rocrand[bi]; + const unsigned int h1 = histogram_stl[bi]; + EXPECT_NEAR(h0, h1, std::max(samples_count * 1e-3, std::max(h0, h1) * 1e-1)); + } } } From e2c28347b431d2b16d39cf8acfdbbeb1ff72a469 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Mon, 15 Apr 2024 07:52:26 +0000 Subject: [PATCH 05/54] fix format and copyright dates --- library/src/rng/system.hpp | 2 +- test/internal/test_cpp_utils.cpp | 2 +- tools/bin2typed.cpp | 2 +- tools/scrambled_sobol32_direction_vector_generator.cpp | 2 +- tools/scrambled_sobol64_direction_vector_generator.cpp | 2 +- tools/sobol_utils.hpp | 10 +++++----- tools/utils.hpp | 10 +++++----- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 424d0f0d6..793d245cc 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -39,7 +39,7 @@ #include #if defined(ROCRAND_PARALLEL_STL) && 
__has_include() -#define ROCRAND_USE_PARALLEL_STL + #define ROCRAND_USE_PARALLEL_STL #include #endif #include diff --git a/test/internal/test_cpp_utils.cpp b/test/internal/test_cpp_utils.cpp index 08741abf4..0d926994f 100644 --- a/test/internal/test_cpp_utils.cpp +++ b/test/internal/test_cpp_utils.cpp @@ -59,7 +59,7 @@ TEST(rocrand_cpp_utils_tests, numeric_combinations) ASSERT_EQ(combinations.size(), A.size() * B.size() * C.size()); - const std::set> combination_set(combinations.begin(), combinations.end()); + const std::set> combination_set(combinations.begin(), combinations.end()); ASSERT_EQ(combinations.size(), combination_set.size()) << "Not all items are unique"; for(auto [a, b, c] : combinations) diff --git a/tools/bin2typed.cpp b/tools/bin2typed.cpp index 59c11d8e7..f49751c4e 100644 --- a/tools/bin2typed.cpp +++ b/tools/bin2typed.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/tools/scrambled_sobol32_direction_vector_generator.cpp b/tools/scrambled_sobol32_direction_vector_generator.cpp index 5833c2665..95e71129f 100644 --- a/tools/scrambled_sobol32_direction_vector_generator.cpp +++ b/tools/scrambled_sobol32_direction_vector_generator.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/tools/scrambled_sobol64_direction_vector_generator.cpp b/tools/scrambled_sobol64_direction_vector_generator.cpp index 80b99d81b..40aaeb04e 100644 --- a/tools/scrambled_sobol64_direction_vector_generator.cpp +++ b/tools/scrambled_sobol64_direction_vector_generator.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal diff --git a/tools/sobol_utils.hpp b/tools/sobol_utils.hpp index a3a67a08d..a18f2f023 100644 --- a/tools/sobol_utils.hpp +++ b/tools/sobol_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,13 +22,13 @@ #define ROCRAND_TOOLS_SOBOL_UTILS_HPP_ #if __has_include() -#include + #include #else -#include + #include namespace std { - namespace filesystem = experimental::filesystem; -} +namespace filesystem = experimental::filesystem; +} // namespace std #endif #include diff --git a/tools/utils.hpp b/tools/utils.hpp index d0ac10c05..527752d04 100644 --- a/tools/utils.hpp +++ b/tools/utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,13 +22,13 @@ #define ROCRAND_TOOLS_UTILS_HPP_ #if __has_include() -#include + #include #else -#include + #include namespace std { - namespace filesystem = experimental::filesystem; -} +namespace filesystem = experimental::filesystem; +} // namespace std #endif #include From 5b659f93f7581879de4d4bbaa197560db86ae1b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Mon, 22 Apr 2024 14:22:15 +0000 Subject: [PATCH 06/54] feat(test): Added CMake option RUN_EXTRA_TESTS --- CMakeLists.txt | 1 + test/CMakeLists.txt | 3 +++ test/test_common.hpp | 30 +++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa162c5ea..30076d6e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ cmake_dependent_option(BUILD_BENCHMARK_TUNING option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) option(CODE_COVERAGE "Build with code coverage flags (clang only)" OFF) option(DEPENDENCIES_FORCE_DOWNLOAD "Don't search the system for dependencies, always download them" OFF) +cmake_dependent_option(RUN_SLOW_TESTS "Run extra tests with CTest. 
These cover niche functionality and take long time" OFF "BUILD_TEST" OFF) # Install prefix set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e8ee4ce6f..8ab3691b9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -46,6 +46,9 @@ function(add_relative_test test_name test_target) file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}") add_test(NAME "${test_name}" COMMAND "./${rel_path}") file(APPEND "${INSTALL_TEST_FILE}" "add_test(${test_name} \"../${EXE_NAME}\")\n") + if(RUN_SLOW_TESTS) + set_tests_properties("${test_name}" PROPERTIES "ENVIRONMENT" "RUN_SLOW_TESTS=1") + endif() endfunction() # Get rocRAND tests source files diff --git a/test/test_common.hpp b/test/test_common.hpp index d06294056..57b88edfb 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -42,25 +42,25 @@ } #ifdef _MSC_VER -inline bool use_hmm() +inline bool is_environment_variable_set_to_1(const char* name) { char buffer[2]{}; size_t size; - if(getenv_s(&size, buffer, "ROCRAND_USE_HMM") != 0) + if(getenv_s(&size, buffer, name) != 0) { return false; } return strcmp(buffer, "1") == 0; } #else -inline bool use_hmm() +inline bool is_environment_variable_set_to_1(const char* name) { - if (getenv("ROCRAND_USE_HMM") == nullptr) + if(getenv(name) == nullptr) { return false; } - if (strcmp(getenv("ROCRAND_USE_HMM"), "1") == 0) + if(strcmp(getenv(name), "1") == 0) { return true; } @@ -68,6 +68,26 @@ inline bool use_hmm() } #endif +inline bool are_slow_tests_enabled() +{ + return is_environment_variable_set_to_1("RUN_SLOW_TESTS"); +} + +#define ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED() \ + do \ + { \ + if(!are_slow_tests_enabled()) \ + { \ + GTEST_SKIP() << "This test can be enabled via environment variable RUN_SLOW_TESTS=1"; \ + } \ + } \ + while(0) + +inline bool use_hmm() +{ + return is_environment_variable_set_to_1("ROCRAND_USE_HMM"); +} 
+ // Helper for HMM allocations: if HMM is requested through // setting environment variable ROCRAND_USE_HMM=1 template From ca16c1c4f6994092996e79007a994c86df93df86 Mon Sep 17 00:00:00 2001 From: Nick Breed Date: Wed, 24 Apr 2024 09:46:24 +0000 Subject: [PATCH 07/54] Removed deprecated internal headers, src/rng/distribution/distributions.hpp and src/rng/device_engines.hpp --- library/src/rng/device_engines.hpp | 31 ---------------- .../rng/distribution/device_distributions.hpp | 36 ------------------- 2 files changed, 67 deletions(-) delete mode 100644 library/src/rng/device_engines.hpp delete mode 100644 library/src/rng/distribution/device_distributions.hpp diff --git a/library/src/rng/device_engines.hpp b/library/src/rng/device_engines.hpp deleted file mode 100644 index 30ed89c7f..000000000 --- a/library/src/rng/device_engines.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#ifndef ROCRAND_RNG_DEVICE_ENGINES_H_ -#define ROCRAND_RNG_DEVICE_ENGINES_H_ - -#include - -#include - -ROCRAND_PRAGMA_MESSAGE("Internal device_engines.hpp header has been deprecated. Please include the " - "necessary headers directly.") - -#endif // ROCRAND_RNG_DEVICE_ENGINES_H_ diff --git a/library/src/rng/distribution/device_distributions.hpp b/library/src/rng/distribution/device_distributions.hpp deleted file mode 100644 index c3d973eab..000000000 --- a/library/src/rng/distribution/device_distributions.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#ifndef ROCRAND_RNG_DISTRIBUTION_DEVICE_DISTRIBUTIONS_H_ -#define ROCRAND_RNG_DISTRIBUTION_DEVICE_DISTRIBUTIONS_H_ - -#include "../device_engines.hpp" - -#include -#include -#include -#include - -#include - -ROCRAND_PRAGMA_MESSAGE("Internal device_distributions.hpp header has been deprecated. Please " - "include the necessary engine- or distribution-headers directly.") - -#endif // ROCRAND_RNG_DISTRIBUTION_DEVICE_DISTRIBUTIONS_H_ From c28da46a008c891a474a7628fe86d13f94148df5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Fri, 19 Apr 2024 14:00:26 +0000 Subject: [PATCH 08/54] Using .lint:clang-format --- .gitlab-ci.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 050f19b2a..43d8243f9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -31,6 +31,7 @@ include: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml + - /deps-format.yaml - /deps-rocm.yaml - /deps-nvcc.yaml - /deps-windows.yaml @@ -46,20 +47,7 @@ stages: clang-format: extends: - - .deps:rocm - stage: lint - needs: [] - tags: - - build - variables: - CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" - GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" - rules: - - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - script: - - cd $CI_PROJECT_DIR - - git config --global --add safe.directory $CI_PROJECT_DIR - - scripts/code-format/check-format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" + - .lint:clang-format copyright-date: extends: From 17aaed3e7a66a432e41f80d7250a073d8d807a2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Fri, 26 Apr 2024 11:18:06 +0000 Subject: [PATCH 09/54] feat(test): Added large size tests for host generators --- test/test_rocrand_host.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/test_rocrand_host.cpp b/test/test_rocrand_host.cpp index 96f5de83c..f481f61ff 100644 --- 
a/test/test_rocrand_host.cpp +++ b/test/test_rocrand_host.cpp @@ -69,14 +69,12 @@ constexpr rocrand_rng_type host_rng_types[] = { class rocrand_generate_host_test : public ::testing::TestWithParam {}; -TEST_P(rocrand_generate_host_test, int_test) +void test_int(const rocrand_rng_type rng_type, const size_t test_size) { - const rocrand_rng_type rng_type = GetParam(); - rocrand_generator generator; ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); - std::vector results(11111); + std::vector results(test_size); for(size_t i = 0; i < seeds_count + random_seeds_count; ++i) { const auto seed = i < seeds_count ? seeds[i] : rand(); @@ -103,6 +101,18 @@ TEST_P(rocrand_generate_host_test, int_test) ROCRAND_CHECK(rocrand_destroy_generator(generator)); } +TEST_P(rocrand_generate_host_test, int_test) +{ + test_int(GetParam(), 11111); +} + +TEST_P(rocrand_generate_host_test, int_test_large) +{ + ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + constexpr size_t large_test_size = size_t(INT_MAX) + 1; + test_int(GetParam(), large_test_size); +} + template void test_int_parity(rocrand_rng_type rng_type, F generate, From 99bc62ebd46b02a9dd1c2b09c9223c38b05fcdc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Mon, 22 Apr 2024 15:00:50 +0000 Subject: [PATCH 10/54] fix(generator): Fixed the usage of min in host generators --- library/src/rng/lfsr113.hpp | 3 ++- library/src/rng/mrg.hpp | 3 ++- library/src/rng/mt19937.hpp | 3 ++- library/src/rng/mtgp32.hpp | 3 ++- library/src/rng/philox4x32_10.hpp | 3 ++- library/src/rng/sobol.hpp | 3 ++- library/src/rng/threefry.hpp | 2 +- library/src/rng/utils/cpp_utils.hpp | 26 ++++++++++++++++++++++++++ library/src/rng/xorwow.hpp | 3 ++- 9 files changed, 41 insertions(+), 8 deletions(-) diff --git a/library/src/rng/lfsr113.hpp b/library/src/rng/lfsr113.hpp index a3acd02ea..6cb86639f 100644 --- a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -28,6 +28,7 @@ #include "distributions.hpp" 
#include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -90,7 +91,7 @@ __host__ __device__ __forceinline__ void generate_lfsr113(dim3 block_idx, const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_width - uintptr / sizeof(T) % output_width) % output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % output_width; const size_t vec_n = (n - head_size) / output_width; diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index 357f06f4b..bb68d40f5 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -29,6 +29,7 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -91,7 +92,7 @@ __host__ __device__ __forceinline__ void generate_mrg(dim3 block_idx, const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_width - uintptr / sizeof(T) % output_width) % output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % output_width; const size_t vec_n = (n - head_size) / output_width; diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index c2a691bbe..deb506628 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -56,6 +56,7 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "mt19937_octo_engine.hpp" +#include "utils/cpp_utils.hpp" #include "config/config_defaults.hpp" #include "config_types.hpp" @@ -712,7 +713,7 @@ class mt19937_generator_template : public generator_impl_base const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_width - uintptr / sizeof(T) % output_width) % output_width; - const unsigned int head_size = 
min(size, misalignment); + const unsigned int head_size = cpp_utils::min(size, misalignment); const unsigned int tail_size = (size - head_size) % output_width; const size_t vec_size = (size - head_size) / output_width; diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index 4ca55957a..5055d99a9 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -62,6 +62,7 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -262,7 +263,7 @@ __host__ __device__ __forceinline__ void generate_mtgp(dim3 block_idx, const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_width - uintptr / sizeof(T) % output_width) % output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % output_width; const size_t vec_n = (n - head_size) / output_width; diff --git a/library/src/rng/philox4x32_10.hpp b/library/src/rng/philox4x32_10.hpp index 86a071a41..ccc70c543 100644 --- a/library/src/rng/philox4x32_10.hpp +++ b/library/src/rng/philox4x32_10.hpp @@ -60,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "distributions.hpp" #include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -130,7 +131,7 @@ __host__ __device__ __forceinline__ void generate_philox(dim3 const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (full_output_width - uintptr / sizeof(T) % full_output_width) % full_output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % full_output_width; const size_t vec_n = (n - head_size) / full_output_width; diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp index 7b1b140b8..9ebce5177 100644 --- a/library/src/rng/sobol.hpp +++ b/library/src/rng/sobol.hpp @@ -26,6 +26,7 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -205,7 +206,7 @@ void generate_sobol_host(dim3 block_idx, { const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_per_thread - uintptr / sizeof(T)) % output_per_thread; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % output_per_thread; const size_t vec_n = (n - head_size) / output_per_thread; diff --git a/library/src/rng/threefry.hpp b/library/src/rng/threefry.hpp index 8982c855c..f2b379e86 100644 --- a/library/src/rng/threefry.hpp +++ b/library/src/rng/threefry.hpp @@ -114,7 +114,7 @@ __host__ __device__ __forceinline__ void generate_threefry(dim3 block_id const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (full_output_width - uintptr / sizeof(T) % full_output_width) % full_output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % 
full_output_width; const size_t vec_n = (n - head_size) / full_output_width; diff --git a/library/src/rng/utils/cpp_utils.hpp b/library/src/rng/utils/cpp_utils.hpp index 341cb9f1d..fdb63d8d3 100644 --- a/library/src/rng/utils/cpp_utils.hpp +++ b/library/src/rng/utils/cpp_utils.hpp @@ -249,6 +249,32 @@ struct vec_wrapper template __host__ __device__ vec_wrapper(V) -> vec_wrapper; +/// \brief Returns the maximum of its arguments. +/// \note This function must be the choice in `__host__ __device__` and preferably on +/// `__device__` functions too. In host code, both this and `std::max` is allowed. +/// +/// The reason for that is that the platform-provided `__clang_hip_math.h` header provides +/// different overload sets for `::min` and `::max` between device and host code. That implementation +/// can result in unwanted implicit conversions to `int` in host code. +template +__host__ __device__ constexpr T max(const T& a, const T& b) +{ + return a < b ? b : a; +} + +/// \brief Returns the minimum of its arguments. +/// \note This function must be the choice in `__host__ __device__` and preferably on +/// `__device__` functions too. In host code, both this and `std::min` is allowed. +/// +/// The reason for that is that the platform-provided `__clang_hip_math.h` header provides +/// different overload sets for `::min` and `::max` between device and host code. That implementation +/// can result in unwanted implicit conversions to `int` in host code. +template +__host__ __device__ constexpr T min(const T& a, const T& b) +{ + return a < b ? 
a : b; +} + } // end namespace rocrand_impl::cpp_utils #endif // ROCRAND_RNG_CPP_UTILS_HPP_ diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 9c34d5951..23af661e3 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -28,6 +28,7 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include #include @@ -81,7 +82,7 @@ __host__ __device__ __forceinline__ void generate_xorwow(dim3 block_idx, const unsigned int thread_id = block_idx.x * BlockSize + thread_idx.x; const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment = (output_width - uintptr / sizeof(T) % output_width) % output_width; - const unsigned int head_size = min(n, misalignment); + const unsigned int head_size = cpp_utils::min(n, misalignment); const unsigned int tail_size = (n - head_size) % output_width; const size_t vec_n = (n - head_size) / output_width; From 7ffc14e22948cc543444b59d3bfa52d344cfc259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gergely=20M=C3=A9sz=C3=A1ros?= Date: Fri, 3 May 2024 15:10:42 +0000 Subject: [PATCH 11/54] docs(dyn_ordering): Use GPU_TARGETS instead of AMDGPU_TARGETS AMDGPU_TARGETS doesn't pick up updates correctly (needs cache clean) whereas GPU_TARGETS does. Every other doc and CI too refers to GPU_TARGETS. 
--- docs/conceptual/dynamic_ordering_configuration.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conceptual/dynamic_ordering_configuration.rst b/docs/conceptual/dynamic_ordering_configuration.rst index e0bac59aa..bba95db08 100644 --- a/docs/conceptual/dynamic_ordering_configuration.rst +++ b/docs/conceptual/dynamic_ordering_configuration.rst @@ -22,14 +22,14 @@ Building the tuning benchmarks The principle of the tuning is very simple: the random number generation kernel is run for a list of kernel block size / kernel grid size combinations, and the fastest combination is selected as the dynamic ordering configuration for the particular device. rocRAND provides an executable target that runs the benchmarks with all these combinations: `benchmark_rocrand_tuning`. This target is disabled by default, and can be enabled and built by the following snippet. -Use the `AMDGPU_TARGET` variable to specify the comma-separated list of GPU architectures to build the benchmarks for. To acquire the architecture of the GPU(s) installed, run `rocminfo`, and look for `gfx` in the "ISA Info" section. :: +Use the `GPU_TARGETS` variable to specify the comma-separated list of GPU architectures to build the benchmarks for. To acquire the architecture of the GPU(s) installed, run `rocminfo`, and look for `gfx` in the "ISA Info" section. :: $ cd rocRAND $ cmake -S . -B ./build -D BUILD_BENCHMARK=ON -D BUILD_BENCHMARK_TUNING=ON -D CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ - -D AMDGPU_TARGETS=gfx908 + -D GPU_TARGETS=gfx908 $ cmake --build build --target benchmark_rocrand_tuning Additionally, the following CMake cache variables control the generation of the benchmarked matrix: From 86e62b50b945592ad7bdd297fdbea88177cc4920 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Fri, 10 May 2024 18:06:00 +0500 Subject: [PATCH 12/54] Use alias method in rocrand_discrete for MTGP32, LFSR113 and ThreeFry discrete_alias is faster than discrete_cdf. 
Though discrete_cdf can be used with PRNGs, it is supposed to be used with QRNGs (Sobol generators) as it maintains quasi-randomness. --- CHANGELOG.md | 6 ++++++ library/include/rocrand/rocrand_discrete.h | 12 ++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd0068e76..38c576334 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Documentation for rocRAND is available at [https://rocm.docs.amd.com/projects/rocRAND/en/latest/](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) +## (Unreleased) rocRAND-3.2.0 for ROCm 6.3.0 + +### Changes + +* `rocrand_discrete` for MTGP32, LFSR113 and ThreeFry generators now uses the alias method, which is faster than binary search in CDF. + ## (Unreleased) rocRAND-3.1.1 for ROCm 6.2.0 ## Fixes diff --git a/library/include/rocrand/rocrand_discrete.h b/library/include/rocrand/rocrand_discrete.h index 1dc591e05..f9bf09f5e 100644 --- a/library/include/rocrand/rocrand_discrete.h +++ b/library/include/rocrand/rocrand_discrete.h @@ -281,7 +281,7 @@ __forceinline__ __device__ unsigned int rocrand_discrete(rocrand_state_mtgp32* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @@ -376,7 +376,7 @@ __forceinline__ __device__ __host__ unsigned int rocrand_discrete(rocrand_state_lfsr113* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @@ -395,7 +395,7 @@ __forceinline__ __device__ __host__ unsigned int rocrand_discrete(rocrand_state_threefry2x32_20* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), 
*discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @@ -414,7 +414,7 @@ __forceinline__ __device__ __host__ unsigned int rocrand_discrete(rocrand_state_threefry2x64_20* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @@ -433,7 +433,7 @@ __forceinline__ __device__ __host__ unsigned int rocrand_discrete(rocrand_state_threefry4x32_20* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @@ -452,7 +452,7 @@ __forceinline__ __device__ __host__ unsigned int rocrand_discrete(rocrand_state_threefry4x64_20* state, const rocrand_discrete_distribution discrete_distribution) { - return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution); + return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution); } /** @} */ // end of group rocranddevice From 2eb5e0743c98df6b627a74fe845a1d8898e5e4fe Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Fri, 16 Feb 2024 12:58:35 +0000 Subject: [PATCH 13/54] refactor mt19937 to support host version as well --- library/src/rng/mt19937.hpp | 255 ++++++++++++-------- test/internal/test_rocrand_mt19937_prng.cpp | 37 ++- 2 files changed, 166 insertions(+), 126 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index deb506628..cafe23c2e 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -57,6 +57,7 @@ #include "generator_type.hpp" #include "mt19937_octo_engine.hpp" #include "utils/cpp_utils.hpp" +#include "system.hpp" #include "config/config_defaults.hpp" #include "config_types.hpp" 
@@ -79,11 +80,14 @@ __forceinline__ __device__ unsigned int wrap_n(unsigned int i) // Config is not actually used for kernel launch here, but is needed to check the number of generators // As this kernel is not dependent on any type just use void for the config, as mt19937 is not tuned for types independently, so all configs are the same for different types. -template -ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel( - unsigned int* __restrict__ engines, - unsigned long long seed, - const unsigned int* __restrict__ jump) +template +__host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 block_dim, + unsigned int* __restrict__ engines, + unsigned long long seed, + const unsigned int* __restrict__ jump) { constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int GeneratorCount @@ -92,16 +96,16 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel && mt19937_jumps_radixes == 2, "Not enough rocrand_h_mt19937_jump values to initialize all generators"); - constexpr unsigned int block_size = jump_ahead_thread_count; - constexpr unsigned int items_per_thread = (mt19937_constants::n + block_size - 1) / block_size; - constexpr unsigned int tail_n = mt19937_constants::n - (items_per_thread - 1) * block_size; + const unsigned int block_size = block_dim.x; + const unsigned int items_per_thread = (mt19937_constants::n + block_size - 1) / block_size; + const unsigned int tail_n = mt19937_constants::n - (items_per_thread - 1) * block_size; __shared__ unsigned int temp[mt19937_constants::n]; unsigned int state[items_per_thread]; // Initialize state 0 (engine_id = 0) used as a base for all engines. // It uses a recurrence relation so one thread calculates all n values. 
- if(threadIdx.x == 0) + if(thread_idx.x == 0) { const unsigned int seedu = (seed >> 32) ^ seed; temp[0] = seedu; @@ -114,14 +118,14 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || threadIdx.x < tail_n) // Check only for the last iteration + if(i < items_per_thread - 1 || thread_idx.x < tail_n) // Check only for the last iteration { - state[i] = temp[i * block_size + threadIdx.x]; + state[i] = temp[i * block_size + thread_idx.x]; } } __syncthreads(); - const unsigned int engine_id = blockIdx.x; + const unsigned int engine_id = block_idx.x; // Jump ahead by engine_id * 2 ^ 1000 using precomputed polynomials for jumps of // i * 2 ^ 1000 and mt19937_jumps_radix * i * 2 ^ 1000 values @@ -139,7 +143,7 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel // Compute jumping ahead with standard Horner method unsigned int ptr = 0; - for(unsigned int i = threadIdx.x; i < mt19937_constants::n; i += block_size) + for(unsigned int i = thread_idx.x; i < mt19937_constants::n; i += block_size) { temp[i] = 0; } @@ -150,7 +154,7 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel for(int pfi = mt19937_constants::mexp - 1; pfi >= 0; pfi--) { // Generate next state - if(threadIdx.x == 0) + if(thread_idx.x == 0) { unsigned int t0 = temp[ptr]; unsigned int t1 = temp[wrap_n(ptr + 1)]; @@ -167,9 +171,9 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel // Add state to temp for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || threadIdx.x < tail_n) + if(i < items_per_thread - 1 || thread_idx.x < tail_n) { - temp[wrap_n(ptr + i * block_size + threadIdx.x)] ^= state[i]; + temp[wrap_n(ptr + i * block_size + thread_idx.x)] ^= state[i]; } } __syncthreads(); @@ -179,9 +183,9 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void 
jump_ahead_kernel // Jump of the next power of 2 will be applied to the current state for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || threadIdx.x < tail_n) + if(i < items_per_thread - 1 || thread_idx.x < tail_n) { - state[i] = temp[wrap_n(ptr + i * block_size + threadIdx.x)]; + state[i] = temp[wrap_n(ptr + i * block_size + thread_idx.x)]; } } __syncthreads(); @@ -190,9 +194,9 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel // Save state for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || threadIdx.x < tail_n) + if(i < items_per_thread - 1 || thread_idx.x < tail_n) { - engines[engine_id * mt19937_constants::n + i * block_size + threadIdx.x] = state[i]; + engines[engine_id * mt19937_constants::n + i * block_size + thread_idx.x] = state[i]; } } } @@ -200,9 +204,12 @@ ROCRAND_KERNEL __launch_bounds__(jump_ahead_thread_count) void jump_ahead_kernel // This kernel is not explicitly tuned, but uses the same configs as the generate-kernels. // As this kernel is not dependent on any type just use void for the config, as mt19937 is not tuned for types independently, so all configs are the same for different types. 
template -ROCRAND_KERNEL - __launch_bounds__((get_block_size(IsDynamic))) void init_engines_kernel( - unsigned int* __restrict__ octo_engines, const unsigned int* __restrict__ engines) +__host__ __device__ inline void init_engines_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 /*block_dim*/, + unsigned int* __restrict__ octo_engines, + const unsigned int* __restrict__ engines) { constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; @@ -213,7 +220,7 @@ ROCRAND_KERNEL static_assert(block_size % threads_per_generator == 0, "All eight threads of the generator must be in the same block"); - const unsigned int thread_id = blockIdx.x * block_size + threadIdx.x; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; // every eight octo engines gather from the same engine mt19937_octo_engine_accessor accessor(octo_engines); mt19937_octo_engine engine; @@ -223,16 +230,19 @@ ROCRAND_KERNEL } template -ROCRAND_KERNEL __launch_bounds__((get_block_size( - IsDynamic))) void generate_short_kernel(unsigned int* __restrict__ engines, - const unsigned int start_input, - T* __restrict__ data, - const size_t size, - VecT* __restrict__ vec_data, - const size_t vec_size, - const unsigned int head_size, - const unsigned int tail_size, - Distribution distribution) +__host__ __device__ inline void generate_short_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 /*block_dim*/, + unsigned int* __restrict__ engines, + const unsigned int start_input, + T* __restrict__ data, + const size_t size, + VecT* __restrict__ vec_data, + const size_t vec_size, + const unsigned int head_size, + const unsigned int tail_size, + Distribution distribution) { constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; @@ -245,7 +255,7 @@ ROCRAND_KERNEL __launch_bounds__((get_block_size( 
constexpr unsigned int input_width = Distribution::input_width; constexpr unsigned int output_width = Distribution::output_width; - const unsigned int thread_id = blockIdx.x * block_size + threadIdx.x; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; unsigned int input[input_width]; T output[output_width]; @@ -309,16 +319,19 @@ ROCRAND_KERNEL __launch_bounds__((get_block_size( } template -ROCRAND_KERNEL __launch_bounds__((get_block_size( - IsDynamic))) void generate_long_kernel(unsigned int* __restrict__ engines, - const unsigned int start_input, - T* __restrict__ data, - const size_t size, - VecT* __restrict__ vec_data, - const size_t vec_size, - const unsigned int head_size, - const unsigned int tail_size, - Distribution distribution) +__host__ __device__ inline void generate_long_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 block_dim, + unsigned int* __restrict__ engines, + const unsigned int start_input, + T* __restrict__ data, + const size_t size, + VecT* __restrict__ vec_data, + const size_t vec_size, + const unsigned int head_size, + const unsigned int tail_size, + Distribution distribution) { constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; @@ -335,16 +348,16 @@ ROCRAND_KERNEL __launch_bounds__((get_block_size( constexpr unsigned int stride = block_size * grid_size; constexpr unsigned int full_stride = stride * inputs_per_state; - const unsigned int thread_id = blockIdx.x * block_size + threadIdx.x; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; unsigned int input[input_width]; T output[output_width]; // Workaround: since load() and store() use the same indices, the compiler decides to keep - // computed addresses alive wasting 78 * 2 VGPRs. blockDim.x equals to block_size but it is + // computed addresses alive wasting 78 * 2 VGPRs. 
block_dim.x equals to block_size but it is // a runtime value so save() will compute new addresses. mt19937_octo_engine_accessor accessor(engines); - mt19937_octo_engine engine = accessor.load(blockIdx.x * blockDim.x + threadIdx.x); + mt19937_octo_engine engine = accessor.load(block_idx.x * block_dim.x + thread_idx.x); size_t base_index = 0; @@ -453,12 +466,15 @@ ROCRAND_KERNEL __launch_bounds__((get_block_size( accessor.save(thread_id, engine); } -template -class mt19937_generator_template : public generator_impl_base +} // end namespace rocrand_host::detail + +template +class rocrand_mt19937_template : public rocrand_generator_impl_base { public: - using base_type = generator_impl_base; + using base_type = rocrand_generator_impl_base; using octo_engine_type = mt19937_octo_engine; + using system_type = System; static constexpr inline unsigned int threads_per_generator = octo_engine_type::threads_per_generator; @@ -514,7 +530,8 @@ class mt19937_generator_template : public generator_impl_base { if(m_engines != nullptr) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(m_engines)); + system_type::free(m_engines); + m_engines = nullptr; } } @@ -583,29 +600,36 @@ class mt19937_generator_template : public generator_impl_base if(m_engines != nullptr) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(m_engines)); + system_type::free(m_engines); } // Allocate device random number engines - err = hipMalloc(reinterpret_cast(&m_engines), - m_generator_count * mt19937_constants::n * sizeof(unsigned int)); - if(err != hipSuccess) + rocrand_status status = system_type::alloc( + &m_engines, + m_generator_count * rocrand_host::detail::mt19937_constants::n * sizeof(unsigned int)); + if(status != ROCRAND_STATUS_SUCCESS) { - return ROCRAND_STATUS_ALLOCATION_FAILED; + return status; } unsigned int* d_engines{}; - err = hipMalloc(&d_engines, - m_generator_count * mt19937_constants::n * sizeof(unsigned int)); - if(err != hipSuccess) + status = system_type::alloc(&d_engines, + m_generator_count * 
rocrand_host::detail::mt19937_constants::n + * sizeof(unsigned int)); + if(status != ROCRAND_STATUS_SUCCESS) { - return ROCRAND_STATUS_ALLOCATION_FAILED; + return status; } unsigned int* d_mt19937_jump{}; + status = system_type::alloc(&d_mt19937_jump, sizeof(rocrand_h_mt19937_jump)); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } err = hipMalloc(&d_mt19937_jump, sizeof(rocrand_h_mt19937_jump)); if(err != hipSuccess) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_engines)); + system_type::free(d_engines); return ROCRAND_STATUS_ALLOCATION_FAILED; } @@ -615,8 +639,8 @@ class mt19937_generator_template : public generator_impl_base hipMemcpyHostToDevice); if(err != hipSuccess) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_engines)); - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_mt19937_jump)); + system_type::free(d_engines); + system_type::free(d_mt19937_jump); return ROCRAND_STATUS_INTERNAL_ERROR; } @@ -624,59 +648,59 @@ class mt19937_generator_template : public generator_impl_base m_order, [&, this](auto is_dynamic) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - jump_ahead_kernel), - dim3(m_generator_count), - dim3(jump_ahead_thread_count), - 0, - m_stream, - d_engines, - m_seed, - d_mt19937_jump); + status = system_type::template launch< + rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::static_block_size_config_provider< + jump_ahead_thread_count>>(dim3(m_generator_count), + dim3(jump_ahead_thread_count), + 0, + m_stream, + d_engines, + m_seed, + d_mt19937_jump); }); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } err = hipStreamSynchronize(m_stream); if(err != hipSuccess) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_engines)); - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_mt19937_jump)); + system_type::free(d_engines); + system_type::free(d_mt19937_jump); return ROCRAND_STATUS_LAUNCH_FAILURE; } - err = hipFree(d_mt19937_jump); - if(err != hipSuccess) - { - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_engines)); - return ROCRAND_STATUS_INTERNAL_ERROR; - } + 
system_type::free(d_mt19937_jump); // This kernel is not actually tuned for ordering, but config is needed for device-side compile time check of the generator count - dynamic_dispatch(m_order, - [&, this](auto is_dynamic) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(init_engines_kernel), - dim3(config.blocks), - dim3(config.threads), - 0, - m_stream, - m_engines, - d_engines); - }); + rocrand_host::detail::dynamic_dispatch( + m_order, + [&, this](auto is_dynamic) + { + status = system_type::template launch< + rocrand_host::detail::init_engines_mt19937>( + dim3(config.blocks), + dim3(config.threads), + 0, + m_stream, + m_engines, + d_engines); + }); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } err = hipStreamSynchronize(m_stream); if(err != hipSuccess) { - ROCRAND_HIP_FATAL_ASSERT(hipFree(d_engines)); + system_type::free(d_engines); return ROCRAND_STATUS_LAUNCH_FAILURE; } - err = hipFree(d_engines); - if(err != hipSuccess) - { - return ROCRAND_STATUS_INTERNAL_ERROR; - } + system_type::free(d_engines); m_engines_initialized = true; m_start_input = 0; @@ -745,15 +769,19 @@ class mt19937_generator_template : public generator_impl_base if(m_start_input > 0 && m_start_input + vec_size + extra <= full_stride) { - // Engines have enough values, generated by the previous generate_long_kernel call. + // Engines have enough values, generated by the previous generate_long_mt19937 call. // This kernel does not load and store engines but loads values directly from global // memory. 
dynamic_dispatch( m_order, [&, this](auto is_dynamic) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generate_short_kernel), + status = system_type::template launch< + rocrand_host::detail::generate_short_mt19937>( dim3(config.blocks), dim3(config.threads), 0, @@ -768,6 +796,10 @@ class mt19937_generator_template : public generator_impl_base tail_size, distribution); }); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } } else { @@ -776,8 +808,12 @@ class mt19937_generator_template : public generator_impl_base m_order, [&, this](auto is_dynamic) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generate_long_kernel), + status = system_type::template launch< + rocrand_host::detail::generate_long_mt19937>( dim3(config.blocks), dim3(config.threads), 0, @@ -792,6 +828,10 @@ class mt19937_generator_template : public generator_impl_base tail_size, distribution); }); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } } // check kernel status @@ -876,8 +916,9 @@ class mt19937_generator_template : public generator_impl_base unsigned int m_generator_count = 0; }; -using mt19937_generator - = mt19937_generator_template>; +using rocrand_mt19937 = rocrand_mt19937_template< + rocrand_system_device, + default_config_provider>; } // namespace rocrand_impl::host diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index c0d08fd81..9158f3764 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -738,10 +738,9 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) // dummy config provider, kernel just needs to verify the amount of generators for the actual call using ConfigProvider = default_config_provider; - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - - jump_ahead_kernel), + rocrand_status status = rocrand_system_device::template launch< + rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::static_block_size_config_provider>( 
dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), 0, @@ -1149,21 +1148,21 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) unsigned int* d_engines1{}; HIP_CHECK(hipMalloc(&d_engines1, generator_count * n * sizeof(unsigned int))); - dynamic_dispatch(ROCRAND_ORDERING_PSEUDO_DEFAULT, - [&](auto is_dynamic) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(jump_ahead_kernel), - dim3(generator_count), - dim3(generator_t::jump_ahead_thread_count), - 0, - 0, - d_engines1, - seed, - d_mt19937_jump); - }); + rocrand_host::detail::dynamic_dispatch( + ROCRAND_ORDERING_PSEUDO_DEFAULT, + [&](auto is_dynamic) + { + rocrand_status status = rocrand_system_device::template launch< + rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::static_block_size_config_provider>( + dim3(generator_count), + dim3(generator_t::jump_ahead_thread_count), + 0, + 0, + d_engines1, + seed, + d_mt19937_jump); + }); std::vector h_engines1(generator_count * n); HIP_CHECK(hipMemcpy(h_engines1.data(), From 8bb422fdedc4b3bf10e887c88f147acd4e0aaeed Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Mon, 19 Feb 2024 08:09:15 +0000 Subject: [PATCH 14/54] update test --- test/internal/test_rocrand_mt19937_prng.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index 9158f3764..c40e947c0 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -739,7 +739,7 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) using ConfigProvider = default_config_provider; rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::jump_ahead_mt19937, rocrand_host::detail::static_block_size_config_provider>( dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), From b7698f8e3d851ed6b5dc6195aa5eb5716e265d72 Mon Sep 17 00:00:00 2001 From: 
Bence Parajdi Date: Mon, 19 Feb 2024 12:01:01 +0000 Subject: [PATCH 15/54] move jump_ahead_thread_count back to template param --- library/src/rng/mt19937.hpp | 12 ++++++------ test/internal/test_rocrand_mt19937_prng.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index cafe23c2e..3a5d8da2c 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -80,11 +80,11 @@ __forceinline__ __device__ unsigned int wrap_n(unsigned int i) // Config is not actually used for kernel launch here, but is needed to check the number of generators // As this kernel is not dependent on any type just use void for the config, as mt19937 is not tuned for types independently, so all configs are the same for different types. -template +template __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, dim3 thread_idx, dim3 /*grid_dim*/, - dim3 block_dim, + dim3 /*block_dim*/, unsigned int* __restrict__ engines, unsigned long long seed, const unsigned int* __restrict__ jump) @@ -96,9 +96,9 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, && mt19937_jumps_radixes == 2, "Not enough rocrand_h_mt19937_jump values to initialize all generators"); - const unsigned int block_size = block_dim.x; - const unsigned int items_per_thread = (mt19937_constants::n + block_size - 1) / block_size; - const unsigned int tail_n = mt19937_constants::n - (items_per_thread - 1) * block_size; + constexpr unsigned int block_size = jump_ahead_thread_count; + constexpr unsigned int items_per_thread = (mt19937_constants::n + block_size - 1) / block_size; + constexpr unsigned int tail_n = mt19937_constants::n - (items_per_thread - 1) * block_size; __shared__ unsigned int temp[mt19937_constants::n]; unsigned int state[items_per_thread]; @@ -649,7 +649,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base [&, this](auto is_dynamic) { status = system_type::template launch< - 
rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::jump_ahead_mt19937, rocrand_host::detail::static_block_size_config_provider< jump_ahead_thread_count>>(dim3(m_generator_count), dim3(jump_ahead_thread_count), diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index c40e947c0..bf1ac9224 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -739,7 +739,7 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) using ConfigProvider = default_config_provider; rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::jump_ahead_mt19937, rocrand_host::detail::static_block_size_config_provider>( dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), @@ -1153,7 +1153,7 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) [&](auto is_dynamic) { rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::jump_ahead_mt19937, rocrand_host::detail::static_block_size_config_provider>( dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), From 10b79e88ef7a1c3184dfdba8252349564b29882a Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 22 Feb 2024 08:24:21 +0000 Subject: [PATCH 16/54] implement memcpy in host and device systems --- library/src/rng/mt19937.hpp | 35 ++++++----------------------------- library/src/rng/system.hpp | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 3a5d8da2c..039ee2224 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -623,21 +623,16 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base unsigned int* d_mt19937_jump{}; status = system_type::alloc(&d_mt19937_jump, sizeof(rocrand_h_mt19937_jump)); 
if(status != ROCRAND_STATUS_SUCCESS) - { - return status; - } - err = hipMalloc(&d_mt19937_jump, sizeof(rocrand_h_mt19937_jump)); - if(err != hipSuccess) { system_type::free(d_engines); - return ROCRAND_STATUS_ALLOCATION_FAILED; + return status; } - err = hipMemcpy(d_mt19937_jump, - rocrand_h_mt19937_jump, + status = system_type::memcpy(d_mt19937_jump, + (rocrand_h_mt19937_jump), sizeof(rocrand_h_mt19937_jump), hipMemcpyHostToDevice); - if(err != hipSuccess) + if(status != ROCRAND_STATUS_SUCCESS) { system_type::free(d_engines); system_type::free(d_mt19937_jump); @@ -660,16 +655,10 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base d_mt19937_jump); }); if(status != ROCRAND_STATUS_SUCCESS) - { - return status; - } - - err = hipStreamSynchronize(m_stream); - if(err != hipSuccess) { system_type::free(d_engines); system_type::free(d_mt19937_jump); - return ROCRAND_STATUS_LAUNCH_FAILURE; + return status; } system_type::free(d_mt19937_jump); @@ -689,15 +678,9 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base d_engines); }); if(status != ROCRAND_STATUS_SUCCESS) - { - return status; - } - - err = hipStreamSynchronize(m_stream); - if(err != hipSuccess) { system_type::free(d_engines); - return ROCRAND_STATUS_LAUNCH_FAILURE; + return status; } system_type::free(d_engines); @@ -834,12 +817,6 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base } } - // check kernel status - if(hipGetLastError() != hipSuccess) - { - return ROCRAND_STATUS_LAUNCH_FAILURE; - } - m_start_input = (m_start_input + vec_size + extra) % full_stride; m_prev_input_width = input_width; diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 793d245cc..0f4562f42 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -42,6 +42,7 @@ #define ROCRAND_USE_PARALLEL_STL #include #endif +#include #include #include @@ -76,6 +77,12 @@ struct host_system delete[] ptr; } + static rocrand_status memcpy(void * dst, const 
void * src, size_t size, hipMemcpyKind /*kind*/) + { + std::memcpy(dst, src, size); + return ROCRAND_STATUS_SUCCESS; + } + template struct KernelArgs { @@ -208,6 +215,16 @@ struct device_system ROCRAND_HIP_FATAL_ASSERT(hipFree(ptr)); } + static rocrand_status memcpy(void * dst, const void * src, size_t size, hipMemcpyKind kind) + { + hipError_t error = hipMemcpy(dst, src, size, kind); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + return ROCRAND_STATUS_SUCCESS; + } + template, From 898dbc6f3069bdff58020bfead3fb0f94349951f Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Fri, 1 Mar 2024 15:21:32 +0000 Subject: [PATCH 17/54] fix compilation issues and segfault --- library/src/rng/mt19937.hpp | 49 ++++++++++++++++++--- library/src/rng/mt19937_octo_engine.hpp | 12 ++++- library/src/rng/system.hpp | 1 + test/internal/test_rocrand_mt19937_prng.cpp | 2 +- 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 039ee2224..21310e389 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -73,7 +73,7 @@ namespace rocrand_impl::host { /// Computes i % n, i must be in range [0, 2 * n) -__forceinline__ __device__ unsigned int wrap_n(unsigned int i) +__forceinline__ __device__ __host__ unsigned int wrap_n(unsigned int i) { return i - (i < mt19937_constants::n ? 
0 : mt19937_constants::n); } @@ -84,7 +84,7 @@ template> 32) ^ seed; temp[0] = seedu; @@ -114,7 +119,10 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, temp[i] = 1812433253 * (temp[i - 1] ^ (temp[i - 1] >> 30)) + i; } } + +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif for(unsigned int i = 0; i < items_per_thread; i++) { @@ -123,7 +131,10 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, state[i] = temp[i * block_size + thread_idx.x]; } } + +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif const unsigned int engine_id = block_idx.x; @@ -143,18 +154,26 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, // Compute jumping ahead with standard Horner method unsigned int ptr = 0; +#if defined(__HIP_DEVICE_COMPILE__) for(unsigned int i = thread_idx.x; i < mt19937_constants::n; i += block_size) +#else + for(unsigned int i = 0; i < mt19937_constants::n; ++i) +#endif { temp[i] = 0; } +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif const unsigned int* pf = jump + (r * (mt19937_jumps_radix - 1) + radix - 1) * mt19937_p_size; for(int pfi = mt19937_constants::mexp - 1; pfi >= 0; pfi--) { // Generate next state +#if defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x == 0) +#endif { unsigned int t0 = temp[ptr]; unsigned int t1 = temp[wrap_n(ptr + 1)]; @@ -163,7 +182,9 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, = (t0 & mt19937_constants::upper_mask) | (t1 & mt19937_constants::lower_mask); temp[ptr] = tm ^ (y >> 1) ^ ((y & 0x1U) ? 
mt19937_constants::matrix_a : 0); } +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif ptr = wrap_n(ptr + 1); if((pf[pfi / 32] >> (pfi % 32)) & 1) @@ -171,12 +192,21 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, // Add state to temp for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || thread_idx.x < tail_n) +#if defined(__HIP_DEVICE_COMPILE__) + unsigned int& j = thread_idx.x; +#else + for (unsigned int j = 0; j < block_dim.x; ++j) +#endif { - temp[wrap_n(ptr + i * block_size + thread_idx.x)] ^= state[i]; + if(i < items_per_thread - 1 || j < tail_n) + { + temp[wrap_n(ptr + i * block_size + j)] ^= state[i]; + } } } +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif } } @@ -188,7 +218,9 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, state[i] = temp[wrap_n(ptr + i * block_size + thread_idx.x)]; } } +#if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); +#endif } // Save state @@ -225,7 +257,7 @@ __host__ __device__ inline void init_engines_mt19937(dim3 block_idx, mt19937_octo_engine_accessor accessor(octo_engines); mt19937_octo_engine engine; engine.gather( - &engines[thread_id / mt19937_octo_engine::threads_per_generator * mt19937_constants::n]); + &engines[thread_id / mt19937_octo_engine::threads_per_generator * mt19937_constants::n], thread_idx); accessor.save(thread_id, engine); } @@ -894,8 +926,11 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base }; using rocrand_mt19937 = rocrand_mt19937_template< - rocrand_system_device, + rocrand_system_host, default_config_provider>; +// using rocrand_mt19937_host = rocrand_mt19937_template< +// rocrand_system_host, +// rocrand_host::detail::default_config_provider>; } // namespace rocrand_impl::host diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index b0d71477c..b3ed7708a 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ 
b/library/src/rng/mt19937_octo_engine.hpp @@ -136,14 +136,14 @@ struct mt19937_octo_engine static constexpr inline unsigned int i568 = 1 + items_per_thread * 10; /// Initialize the octo engine from the engine it shares with seven other threads. - __forceinline__ __device__ void gather(const unsigned int engine[mt19937_constants::n]) + __forceinline__ __device__ void gather(const unsigned int engine[mt19937_constants::n], dim3 thread_idx) { constexpr unsigned int off_cnt = 11; /// Used to map the \p mt19937_octo_state.mt indices to \p mt19937_state.mt indices. constexpr unsigned int offsets[off_cnt] = {1, 57, 114, 171, 227, 284, 341, 398, 454, 511, 568}; - const unsigned int tid = threadIdx.x & 7U; + const unsigned int tid = thread_idx.x & 7U; // initialize the elements that follow a regular pattern for(unsigned int i = 0; i < off_cnt; i++) @@ -222,6 +222,7 @@ struct mt19937_octo_engine /// Eights threads collaborate in computing the n next values. __forceinline__ __device__ void gen_next_n() { +#if defined(__HIP_DEVICE_COMPILE__) const unsigned int tid = threadIdx.x & 7U; // compute eleven vectors that follow a regular pattern and compute @@ -365,6 +366,13 @@ struct mt19937_octo_engine // needs [568, 623], [0, 0]', and [341, 396]' const unsigned int v000 = shuffle(m_state.mt[i000_0], 0); comp_vector(tid, i568, i341, v000); +#else + static constexpr unsigned int n = 1U + items_per_thread * 11U; + for (unsigned int i = 0; i < n; ++i) + { + m_state.mt[i] = comp(m_state.mt[i], m_state.mt[(i + 1) % n], m_state.mt[(i + mt19937_constants::m) % n]); + } +#endif } /// Return \p i state value without tempering diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 0f4562f42..40090f009 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -160,6 +160,7 @@ struct host_system if constexpr(UseHostFunc) { hipError_t status = hipLaunchHostFunc(stream, kernel_callback, kernel_args); + hipStreamSynchronize(stream); if(status != hipSuccess) { 
diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index bf1ac9224..9ed14886a 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -543,7 +543,7 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_engines_k const unsigned int thread_id = blockIdx.x * blockDim.x + threadIdx.x; unsigned int engine_id = thread_id / mt19937_octo_engine::threads_per_generator; mt19937_octo_engine engine = octo_engines[thread_id]; - engine.gather(&engines[engine_id * n]); + engine.gather(&engines[engine_id * n], threadIdx); engine.gen_next_n(); if(engine_id == 0) From 3f8d454ee7f530e71eaa8bd131d1b30f21366807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 16 May 2024 09:02:21 +0000 Subject: [PATCH 18/54] Removed 'apt-get install flang' --- .gitlab-ci.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 43d8243f9..d1f591894 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -132,12 +132,10 @@ copyright-date: - .rules:build needs: [] script: - - $SUDO_CMD apt-get install -y flang - cmake -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build -G Ninja - -D CMAKE_FORTRAN_COMPILER=/usr/bin/flang -D CMAKE_CXX_COMPILER=${COMPILER} -D BUILD_TEST=ON -D BUILD_BENCHMARK=ON @@ -161,12 +159,10 @@ copyright-date: - .rules:build needs: [] script: - - $SUDO_CMD apt-get install -y flang - cmake -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build -G Ninja - -D CMAKE_FORTRAN_COMPILER=/usr/bin/flang -D CMAKE_CXX_COMPILER=${COMPILER} -D CMAKE_CUDA_HOST_COMPILER=${COMPILER} -D BUILD_TEST=ON @@ -341,7 +337,6 @@ benchmark:benchmark-tuning-generate-results: - .rules:test stage: test script: - - $SUDO_CMD apt-get install -y flang - cd $CI_PROJECT_DIR/build # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. # This causes the hipMemcpy to fail, which is not reported as an error by HIP. 
From 31298257a2cbec5b111be0fe69d566842e565aff Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Fri, 8 Mar 2024 10:47:31 +0000 Subject: [PATCH 19/54] fix jump_ahead on host --- library/src/rng/mt19937.hpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 21310e389..4ac687c8e 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -89,6 +89,13 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, unsigned long long seed, const unsigned int* __restrict__ jump) { +#if !defined(__HIP_DEVICE_COMPILE__) + if (thread_idx.x > 0) + { + return; + } +#endif + constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int GeneratorCount = config.threads * config.blocks / mt19937_octo_engine::threads_per_generator; @@ -104,7 +111,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, __shared__ #endif unsigned int temp[mt19937_constants::n]; - unsigned int state[items_per_thread]; + unsigned int state[items_per_thread]; // Initialize state 0 (engine_id = 0) used as a base for all engines. // It uses a recurrence relation so one thread calculates all n values. 
@@ -126,9 +133,16 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || thread_idx.x < tail_n) // Check only for the last iteration +#if defined(__HIP_DEVICE_COMPILE__) + unsigned int& j = thread_idx.x; +#else + for (unsigned int j = 0; j < block_dim.x; ++j) +#endif { - state[i] = temp[i * block_size + thread_idx.x]; + if(i < items_per_thread - 1 || j < tail_n) // Check only for the last iteration + { + state[i] = temp[i * block_size + j]; + } } } @@ -226,9 +240,16 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, // Save state for(unsigned int i = 0; i < items_per_thread; i++) { - if(i < items_per_thread - 1 || thread_idx.x < tail_n) +#if defined(__HIP_DEVICE_COMPILE__) + unsigned int& j = thread_idx.x; +#else + for (unsigned int j = 0; j < block_dim.x; ++j) +#endif { - engines[engine_id * mt19937_constants::n + i * block_size + thread_idx.x] = state[i]; + if(i < items_per_thread - 1 || j < tail_n) + { + engines[engine_id * mt19937_constants::n + i * block_size + j] = state[i]; + } } } } From a28f772c36f502900ef3ef2929c406b7366c8fa2 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 21 Mar 2024 10:39:01 +0000 Subject: [PATCH 20/54] create host implementation of some functions refactor generate_long_mt19937 to work on host as well --- library/src/rng/mt19937.hpp | 81 +++++++++++- library/src/rng/mt19937_octo_engine.hpp | 166 +++++++++++++++++++++++- 2 files changed, 237 insertions(+), 10 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 4ac687c8e..dc499f15c 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -386,6 +386,12 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, const unsigned int tail_size, Distribution distribution) { +#if !defined(__HIP_DEVICE_COMPILE__) + if (thread_idx.x % 8 != 0) + { + return; + } +#endif constexpr generator_config 
config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; constexpr unsigned int grid_size = config.blocks; @@ -403,14 +409,29 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; +#if defined(__HIP_DEVICE_COMPILE__) unsigned int input[input_width]; T output[output_width]; +#else + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; +#endif // Workaround: since load() and store() use the same indices, the compiler decides to keep // computed addresses alive wasting 78 * 2 VGPRs. block_dim.x equals to block_size but it is // a runtime value so save() will compute new addresses. mt19937_octo_engine_accessor accessor(engines); + +#if defined(__HIP_DEVICE_COMPILE__) mt19937_octo_engine engine = accessor.load(block_idx.x * block_dim.x + thread_idx.x); +#else + mt19937_octo_engine thread_engines[8]; +#pragma unroll + for (size_t i = 0; i < 8; ++i) + { + thread_engines[i] = accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); + } +#endif size_t base_index = 0; @@ -418,6 +439,14 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, // the end sequence, but not yet used. if(start_input > 0) { +#if !defined(__HIP_DEVICE_COMPILE__) +#pragma unroll + for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) + { + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; +#endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) { @@ -436,6 +465,9 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, vec_data[thread_index] = *reinterpret_cast(output); } } +#if !defined(__HIP_DEVICE_COMPILE__) + } +#endif base_index = full_stride - start_input; } @@ -443,7 +475,20 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, // in a fast unrolled loop. 
for(; base_index + full_stride <= vec_size; base_index += full_stride) { - engine.gen_next_n(); +// #if defined(__HIP_DEVICE_COMPILE__) +// engine.gen_next_n(); +// #else +// mt19937_octo_engine::gen_next_n(thread_engines); +// #endif + +#if !defined(__HIP_DEVICE_COMPILE__) +#pragma unroll + for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) + { + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; +#endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) { @@ -458,6 +503,9 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, const size_t thread_index = base_index + j * stride + thread_id; vec_data[thread_index] = *reinterpret_cast(output); } +#if !defined(__HIP_DEVICE_COMPILE__) + } +#endif } // Generate one extra VecT if data is not aligned by sizeof(VecT) or @@ -468,7 +516,20 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, if(base_index < vec_size + extra) { bool is_extra_thread = false; - engine.gen_next_n(); +// #if defined(__HIP_DEVICE_COMPILE__) +// engine.gen_next_n(); +// #else +// mt19937_octo_engine::gen_next_n(thread_engines); +// #endif + +#if !defined(__HIP_DEVICE_COMPILE__) +#pragma unroll + for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) + { + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; +#endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) { @@ -513,10 +574,20 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, } } } +#if !defined(__HIP_DEVICE_COMPILE__) + } +#endif } // save state +#if defined(__HIP_DEVICE_COMPILE__) accessor.save(thread_id, engine); +#else + for (size_t i = 0; i < 8; ++i) + { + accessor.save(block_idx.x * block_dim.x + thread_idx.x + i, thread_engines[i]); + } +#endif } } // end namespace rocrand_host::detail @@ -947,11 +1018,11 @@ class 
rocrand_mt19937_template : public rocrand_generator_impl_base }; using rocrand_mt19937 = rocrand_mt19937_template< + rocrand_system_device, + default_config_provider>; +using rocrand_mt19937_host = rocrand_mt19937_template< rocrand_system_host, default_config_provider>; -// using rocrand_mt19937_host = rocrand_mt19937_template< -// rocrand_system_host, -// rocrand_host::detail::default_config_provider>; } // namespace rocrand_impl::host diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index b3ed7708a..8d20e122c 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -219,6 +219,35 @@ struct mt19937_octo_engine m_state.mt[idx_i + j] = comp(m_state.mt[idx_i + j], last_dep, m_state.mt[idx_m + j]); } + static void comp_vector(unsigned int idx_i, + unsigned int idx_m, + unsigned int last_dep_tid_7, + mt19937_octo_engine thread_engines[8]) + { + // communicate the dependency for the last value + unsigned int last_deps[8]; + for (int i = 0; i < 8; ++i) + { + last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[idx_i]; + } + + for (int i = 0; i < 8; ++i) + { + // thread 7 needs a special value that does not fit the pattern + unsigned int last_dep = i == 7 ? last_dep_tid_7 : last_deps[i]; + + unsigned int j; + for(j = 0; j < items_per_thread - 1; j++) + { + // compute (i + ipt * i + j)': needs (i + ipt * i + 1 + j) % n and (i + ipt * i + m + j) % n + thread_engines[i].m_state.mt[idx_i + j] + = comp(thread_engines[i].m_state.mt[idx_i + j], thread_engines[i].m_state.mt[idx_i + j + 1], thread_engines[i].m_state.mt[idx_m + j]); + } + // compute the last value using the communicated dependency + thread_engines[i].m_state.mt[idx_i + j] = comp(thread_engines[i].m_state.mt[idx_i + j], last_dep, thread_engines[i].m_state.mt[idx_m + j]); + } + } + /// Eights threads collaborate in computing the n next values. 
__forceinline__ __device__ void gen_next_n() { @@ -366,13 +395,140 @@ struct mt19937_octo_engine // needs [568, 623], [0, 0]', and [341, 396]' const unsigned int v000 = shuffle(m_state.mt[i000_0], 0); comp_vector(tid, i568, i341, v000); -#else - static constexpr unsigned int n = 1U + items_per_thread * 11U; - for (unsigned int i = 0; i < n; ++i) + +#endif + } + + static void gen_next_n(mt19937_octo_engine thread_engines[8]) + { + const unsigned int tid = threadIdx.x & 7U; + + // compute eleven vectors that follow a regular pattern and compute + // eight special values for a total of n new elements. + // ' indicates new value + + // compute 0': needs 1 and 397 + const unsigned int v397 = thread_engines[5].m_state.mt[i397_5]; + thread_engines[0].m_state.mt[i000_0] = comp(thread_engines[0].m_state.mt[i000_0], thread_engines[0].m_state.mt[i001], v397); + + // compute [ 1 + i * ipt, 1 + ipt * (i + 1))' = [ 1, 56]': + // needs [ 1, 57] and [398, 453] + const unsigned int v057 = thread_engines[0].m_state.mt[i057]; + comp_vector(i001, i398, v057, thread_engines); + + // compute [ 57 + i * ipt, 57 + ipt * (i + 1))' = [ 57, 112]': + // needs [ 57, 113] and [454, 509] + const unsigned int v113 = thread_engines[1].m_state.mt[i113_1]; + comp_vector(i057, i454, v113, thread_engines); + + // compute 113': needs 114 and 510 + const unsigned int v114 = thread_engines[0].m_state.mt[i114]; + const unsigned int v510 = thread_engines[6].m_state.mt[i510_6]; + thread_engines[1].m_state.mt[i113_1] = comp(thread_engines[1].m_state.mt[i113_1], v114, v510); + + // compute [114 + i * ipt, 114 + ipt * (i + 1))' = [114, 169]': + // needs [114, 170] and [511, 566] + const unsigned int v170 = thread_engines[2].m_state.mt[i170_2]; + comp_vector(i114, i511, v170, thread_engines); + + // compute 170': needs 171 and 567 + const unsigned int v171 = thread_engines[0].m_state.mt[i171]; + const unsigned int v567 = thread_engines[7].m_state.mt[i567_7]; + thread_engines[2].m_state.mt[i170_2] = 
comp(thread_engines[2].m_state.mt[i170_2], v171, v567); + + // compute [171 + i * ipt, 171 + ipt * (i + 1))' = [171, 226]': + // needs [171, 227] and [568, 623] + const unsigned int v227 = thread_engines[0].m_state.mt[i227]; + comp_vector(i171, i568, v227, thread_engines); + + // compute [227 + i * ipt, 227 + ipt * (i + 1))' = [227, 282]': + // needs [227, 283] and [ 0, 55]' + const unsigned int v283 = thread_engines[3].m_state.mt[i283_3]; + // comp_vector(tid, i227, s_000, v283); + // written out below, since value 0 does not fit the regular pattern { - m_state.mt[i] = comp(m_state.mt[i], m_state.mt[(i + 1) % n], m_state.mt[(i + mt19937_constants::m) % n]); + unsigned int last_deps[8]; + unsigned int first_deps[8]; + + for (int i = 0; i < 8; ++i) + { + last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[i227]; + first_deps[i] = thread_engines[(i - 1) % 8].m_state.mt[i001 + items_per_thread - 1]; + } + + for (int i = 0; i < 8; ++i) + { + // communicate the dependency for the first and last value + unsigned int last_dep = i == 7 ? v283 : last_dep; + unsigned int first_dep = i == 0 ? 
thread_engines[0].m_state.mt[i000_0] : first_dep; + + // extract the first and last iterations from the loop + unsigned int j = 0; + thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], thread_engines[i].m_state.mt[i227 + j + 1], first_dep); + for(j = 1; j < items_per_thread - 1; j++) + { + thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], + thread_engines[i].m_state.mt[i227 + j + 1], + thread_engines[i].m_state.mt[i001 + j - 1]); + } + thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], last_dep, thread_engines[i].m_state.mt[i001 + j - 1]); + } } -#endif + + // compute 283': needs 284 and 56' + const unsigned int v284 = thread_engines[0].m_state.mt[i284]; + const unsigned int v056 = thread_engines[7].m_state.mt[i001 + 6]; // 1 + 7 * 7 + 6 = 56 + thread_engines[3].m_state.mt[i283_3] = comp(thread_engines[3].m_state.mt[i283_3], v284, v056); + + // compute [284 + i * ipt, 284 + ipt * (i + 1))' = [284, 339]': + // needs [284, 340] and [ 57, 112]' + const unsigned int v340 = thread_engines[4].m_state.mt[i340_4]; + comp_vector(i284, i057, v340, thread_engines); + + // compute 340': needs 341 and 113' + const unsigned int v113_ = thread_engines[1].m_state.mt[i113_1]; + const unsigned int v341 = thread_engines[0].m_state.mt[i341]; + thread_engines[4].m_state.mt[i340_4] = comp(thread_engines[4].m_state.mt[i340_4], v341, v113_); + + // compute [341 + i * ipt, 341 + ipt * (i + 1))' = [341, 396]': + // needs [341, 397] and [114, 169]' + const unsigned int v397_ = thread_engines[5].m_state.mt[i397_5]; + comp_vector(i341, i114, v397_, thread_engines); + + // compute 397': needs 398 and 170' + const unsigned int v398 = thread_engines[0].m_state.mt[i398]; + const unsigned int v170_ = thread_engines[2].m_state.mt[i170_2]; + thread_engines[5].m_state.mt[i397_5] = comp(thread_engines[5].m_state.mt[i397_5], v398, v170_); + + // compute [398 + i * ipt, 398 + ipt * (i + 1))' = [398, 
453]': + // needs [398, 454] and [171, 226]' + const unsigned int v454 = thread_engines[0].m_state.mt[i454]; + comp_vector(i398, i171, v454, thread_engines); + + // compute [454 + i * ipt, 454 + ipt * (i + 1))' = [454, 509]': + // needs [454, 510] and [227, 282]' + const unsigned int v510_ = thread_engines[6].m_state.mt[i510_6]; + comp_vector(i454, i227, v510_, thread_engines); + + // compute 510': needs 511 and 283' + const unsigned int v511 = thread_engines[0].m_state.mt[i511]; + const unsigned int v283_ = thread_engines[3].m_state.mt[i283_3]; + thread_engines[6].m_state.mt[i510_6] = comp(thread_engines[6].m_state.mt[i510_6], v511, v283_); + + // compute [511 + i * ipt, 511 + ipt * (i + 1))' = [511, 566]': + // needs [511, 567] and [284, 339]' + const unsigned int v567_ = thread_engines[7].m_state.mt[i567_7]; + comp_vector(i511, i284, v567_, thread_engines); + + // compute 567': needs 568 and 340' + const unsigned int v568 = thread_engines[0].m_state.mt[i568]; + const unsigned int i340 = thread_engines[4].m_state.mt[i340_4]; + thread_engines[7].m_state.mt[i567_7] = comp(thread_engines[7].m_state.mt[i567_7], v568, i340); + + // compute [568 + i * ipt, 568 + ipt * (i + 1))' = [568, 623]': + // needs [568, 623], [0, 0]', and [341, 396]' + const unsigned int v000 = thread_engines[0].m_state.mt[i000_0]; + comp_vector(i568, i341, v000, thread_engines); } /// Return \p i state value without tempering From 3a0bca68cc4598de68f36a02082478f09647f09b Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 28 Mar 2024 15:03:16 +0000 Subject: [PATCH 21/54] fix remaining inconsistencies in host mt19937 generator --- library/src/rng/mt19937.hpp | 83 +++++++++++++++++++++---- library/src/rng/mt19937_octo_engine.hpp | 5 -- library/src/rocrand.cpp | 2 + test/test_rocrand_host.cpp | 1 + 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index dc499f15c..cc8bf3fa1 100644 --- a/library/src/rng/mt19937.hpp +++ 
b/library/src/rng/mt19937.hpp @@ -108,10 +108,13 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, constexpr unsigned int tail_n = mt19937_constants::n - (items_per_thread - 1) * block_size; #if defined(__HIP_DEVICE_COMPILE__) - __shared__ + __shared__ unsigned int temp[mt19937_constants::n]; + unsigned int state[items_per_thread]; + +#else + unsigned int temp[mt19937_constants::n]; + unsigned int states[block_size][items_per_thread]; #endif - unsigned int temp[mt19937_constants::n]; - unsigned int state[items_per_thread]; // Initialize state 0 (engine_id = 0) used as a base for all engines. // It uses a recurrence relation so one thread calculates all n values. @@ -137,13 +140,16 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, unsigned int& j = thread_idx.x; #else for (unsigned int j = 0; j < block_dim.x; ++j) -#endif { + auto& state = states[j]; +#endif if(i < items_per_thread - 1 || j < tail_n) // Check only for the last iteration { state[i] = temp[i * block_size + j]; } +#if !defined(__HIP_DEVICE_COMPILE__) } +#endif } #if defined(__HIP_DEVICE_COMPILE__) @@ -210,13 +216,16 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, unsigned int& j = thread_idx.x; #else for (unsigned int j = 0; j < block_dim.x; ++j) -#endif { + auto& state = states[j]; +#endif if(i < items_per_thread - 1 || j < tail_n) { temp[wrap_n(ptr + i * block_size + j)] ^= state[i]; } +#if !defined(__HIP_DEVICE_COMPILE__) } +#endif } #if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); @@ -225,14 +234,23 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } // Jump of the next power of 2 will be applied to the current state - for(unsigned int i = 0; i < items_per_thread; i++) +#if defined(__HIP_DEVICE_COMPILE__) + unsigned int& j = thread_idx.x; +#else + for (unsigned int j = 0; j < block_dim.x; ++j) { - if(i < items_per_thread - 1 || thread_idx.x < tail_n) + auto& state = states[j]; +#endif + for(unsigned int i = 0; i < 
items_per_thread; i++) { - state[i] = temp[wrap_n(ptr + i * block_size + thread_idx.x)]; + if(i < items_per_thread - 1 || j < tail_n) + { + state[i] = temp[wrap_n(ptr + i * block_size + j)]; + } } +#if !defined(__HIP_DEVICE_COMPILE__) } -#if defined(__HIP_DEVICE_COMPILE__) +#else __syncthreads(); #endif } @@ -244,13 +262,16 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, unsigned int& j = thread_idx.x; #else for (unsigned int j = 0; j < block_dim.x; ++j) -#endif { + auto& state = states[j]; +#endif if(i < items_per_thread - 1 || j < tail_n) { engines[engine_id * mt19937_constants::n + i * block_size + j] = state[i]; } +#if !defined(__HIP_DEVICE_COMPILE__) } +#endif } } @@ -297,6 +318,12 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, const unsigned int tail_size, Distribution distribution) { +#if !defined(__HIP_DEVICE_COMPILE__) + if (thread_idx.x % 8 != 0) + { + return; + } +#endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; constexpr unsigned int grid_size = config.blocks; @@ -308,10 +335,17 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, constexpr unsigned int input_width = Distribution::input_width; constexpr unsigned int output_width = Distribution::output_width; +#if defined(__HIP_DEVICE_COMPILE__) const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; +#endif +#if defined(__HIP_DEVICE_COMPILE__) unsigned int input[input_width]; T output[output_width]; +#else + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; +#endif // Generate one extra VecT if data is not aligned by sizeof(VecT) or // size % output_width != 0 @@ -326,6 +360,15 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, const unsigned int j_end = (start_input + vec_size + extra + stride - 1) / stride; for(unsigned int j = j_start; j < j_end; j++) { +#if 
!defined(__HIP_DEVICE_COMPILE__) +#pragma unroll + for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) + { + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; +#endif + if(j * stride + thread_id >= start_input && j * stride + thread_id - start_input < vec_size + extra) { @@ -349,10 +392,19 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, vec_data[thread_index] = *reinterpret_cast(output); } } +#if !defined(__HIP_DEVICE_COMPILE__) + } +#endif } if constexpr(output_width > 1) { +#if !defined(__HIP_DEVICE_COMPILE__) +#pragma unroll + for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) + { + auto& output = outputs[warp_lane]; +#endif // Save head and tail, output was generated earlier if(is_extra_thread) { @@ -368,6 +420,9 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, } } } +#if !defined(__HIP_DEVICE_COMPILE__) + } +#endif } } @@ -407,9 +462,8 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, constexpr unsigned int stride = block_size * grid_size; constexpr unsigned int full_stride = stride * inputs_per_state; - const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; - #if defined(__HIP_DEVICE_COMPILE__) + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; unsigned int input[input_width]; T output[output_width]; #else @@ -446,6 +500,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, auto& input = inputs[warp_lane]; auto& output = outputs[warp_lane]; mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) @@ -488,6 +543,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, auto& input = inputs[warp_lane]; auto& output = 
outputs[warp_lane]; mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) @@ -529,6 +585,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, auto& input = inputs[warp_lane]; auto& output = outputs[warp_lane]; mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll for(unsigned int j = 0; j < inputs_per_state; j++) @@ -753,7 +810,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base } status = system_type::memcpy(d_mt19937_jump, - (rocrand_h_mt19937_jump), + rocrand_h_mt19937_jump, sizeof(rocrand_h_mt19937_jump), hipMemcpyHostToDevice); if(status != ROCRAND_STATUS_SUCCESS) diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index 8d20e122c..0d75ab784 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -251,7 +251,6 @@ struct mt19937_octo_engine /// Eights threads collaborate in computing the n next values. __forceinline__ __device__ void gen_next_n() { -#if defined(__HIP_DEVICE_COMPILE__) const unsigned int tid = threadIdx.x & 7U; // compute eleven vectors that follow a regular pattern and compute @@ -395,14 +394,10 @@ struct mt19937_octo_engine // needs [568, 623], [0, 0]', and [341, 396]' const unsigned int v000 = shuffle(m_state.mt[i000_0], 0); comp_vector(tid, i568, i341, v000); - -#endif } static void gen_next_n(mt19937_octo_engine thread_engines[8]) { - const unsigned int tid = threadIdx.x & 7U; - // compute eleven vectors that follow a regular pattern and compute // eight special values for a total of n new elements. 
// ' indicates new value diff --git a/library/src/rocrand.cpp b/library/src/rocrand.cpp index f3b81437e..81e77d506 100644 --- a/library/src/rocrand.cpp +++ b/library/src/rocrand.cpp @@ -80,6 +80,8 @@ rocrand_status create_generator_host(rocrand_generator* generator, rocrand_rng_t *generator = new generator_type>(); break; case ROCRAND_RNG_PSEUDO_MT19937: + *generator = new rocrand_generator_type>(); + break; default: return ROCRAND_STATUS_TYPE_ERROR; } diff --git a/test/test_rocrand_host.cpp b/test/test_rocrand_host.cpp index f481f61ff..bc5c53751 100644 --- a/test/test_rocrand_host.cpp +++ b/test/test_rocrand_host.cpp @@ -52,6 +52,7 @@ constexpr rocrand_rng_type host_rng_types[] = { ROCRAND_RNG_PSEUDO_LFSR113, ROCRAND_RNG_PSEUDO_MRG31K3P, ROCRAND_RNG_PSEUDO_MRG32K3A, + ROCRAND_RNG_PSEUDO_MT19937, ROCRAND_RNG_PSEUDO_MTGP32, ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, From 7201795f4a6d647079de8e44eb3259f7341799c6 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 4 Apr 2024 13:45:27 +0000 Subject: [PATCH 22/54] fix format end compile errors --- benchmark/tuning/benchmarked_generators.hpp | 2 +- library/src/rng/mt19937.hpp | 268 ++++++++++---------- library/src/rng/mt19937_octo_engine.hpp | 76 ++++-- library/src/rng/system.hpp | 10 +- test/internal/test_rocrand_mt19937_prng.cpp | 38 ++- 5 files changed, 219 insertions(+), 175 deletions(-) diff --git a/benchmark/tuning/benchmarked_generators.hpp b/benchmark/tuning/benchmarked_generators.hpp index 16031a850..f50133d0f 100644 --- a/benchmark/tuning/benchmarked_generators.hpp +++ b/benchmark/tuning/benchmarked_generators.hpp @@ -107,7 +107,7 @@ using mtgp32_generator_template ConfigProvider>; template -using mt19937_generator_template = rocrand_impl::host::mt19937_generator_template; +using mt19937_generator_template = rocrand_impl::host::mt19937_generator_template; template using philox4x32_10_generator_template diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 
cc8bf3fa1..e18aa2499 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -84,13 +84,13 @@ template 0) + if(thread_idx.x > 0) { return; } @@ -134,12 +134,13 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, __syncthreads(); #endif + // clang-format off for(unsigned int i = 0; i < items_per_thread; i++) { #if defined(__HIP_DEVICE_COMPILE__) unsigned int& j = thread_idx.x; #else - for (unsigned int j = 0; j < block_dim.x; ++j) + for(unsigned int j = 0; j < block_dim.x; ++j) { auto& state = states[j]; #endif @@ -151,6 +152,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } #endif } + // clang-format on #if defined(__HIP_DEVICE_COMPILE__) __syncthreads(); @@ -162,6 +164,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, // i * 2 ^ 1000 and mt19937_jumps_radix * i * 2 ^ 1000 values // where i is in range [1; mt19937_jumps_radix). unsigned int e = engine_id; + // clang-format off for(unsigned int r = 0; r < mt19937_jumps_radixes; r++) { const unsigned int radix = e % mt19937_jumps_radix; @@ -215,7 +218,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, #if defined(__HIP_DEVICE_COMPILE__) unsigned int& j = thread_idx.x; #else - for (unsigned int j = 0; j < block_dim.x; ++j) + for(unsigned int j = 0; j < block_dim.x; ++j) { auto& state = states[j]; #endif @@ -237,7 +240,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, #if defined(__HIP_DEVICE_COMPILE__) unsigned int& j = thread_idx.x; #else - for (unsigned int j = 0; j < block_dim.x; ++j) + for(unsigned int j = 0; j < block_dim.x; ++j) { auto& state = states[j]; #endif @@ -261,7 +264,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, #if defined(__HIP_DEVICE_COMPILE__) unsigned int& j = thread_idx.x; #else - for (unsigned int j = 0; j < block_dim.x; ++j) + for(unsigned int j = 0; j < block_dim.x; ++j) { auto& state = states[j]; #endif @@ -273,6 +276,7 @@ __host__ 
__device__ inline void jump_ahead_mt19937(dim3 block_idx, } #endif } + // clang-format on } // This kernel is not explicitly tuned, but uses the same configs as the generate-kernels. @@ -299,7 +303,8 @@ __host__ __device__ inline void init_engines_mt19937(dim3 block_idx, mt19937_octo_engine_accessor accessor(octo_engines); mt19937_octo_engine engine; engine.gather( - &engines[thread_id / mt19937_octo_engine::threads_per_generator * mt19937_constants::n], thread_idx); + &engines[thread_id / mt19937_octo_engine::threads_per_generator * mt19937_constants::n], + thread_idx); accessor.save(thread_id, engine); } @@ -319,9 +324,9 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, Distribution distribution) { #if !defined(__HIP_DEVICE_COMPILE__) - if (thread_idx.x % 8 != 0) + if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); @@ -343,8 +348,8 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, unsigned int input[input_width]; T output[output_width]; #else - unsigned int inputs[8][input_width]; - T outputs[8][output_width]; + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; #endif // Generate one extra VecT if data is not aligned by sizeof(VecT) or @@ -358,40 +363,41 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, // it is beneficial to calculate what iterations will actually write data. 
const unsigned int j_start = start_input / stride; const unsigned int j_end = (start_input + vec_size + extra + stride - 1) / stride; + // clang-format off for(unsigned int j = j_start; j < j_end; j++) { #if !defined(__HIP_DEVICE_COMPILE__) -#pragma unroll + #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { - auto& input = inputs[warp_lane]; - auto& output = outputs[warp_lane]; + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif - if(j * stride + thread_id >= start_input - && j * stride + thread_id - start_input < vec_size + extra) - { - mt19937_octo_engine_accessor accessor(engines); -#pragma unroll - for(unsigned int i = 0; i < input_width; i++) + if(j * stride + thread_id >= start_input + && j * stride + thread_id - start_input < vec_size + extra) { - input[i] = mt19937_octo_engine::temper( - accessor.load_value(thread_id, j * input_width + i)); - } + mt19937_octo_engine_accessor accessor(engines); +#pragma unroll + for(unsigned int i = 0; i < input_width; i++) + { + input[i] = mt19937_octo_engine::temper( + accessor.load_value(thread_id, j * input_width + i)); + } - distribution(input, output); + distribution(input, output); - const size_t thread_index = j * stride + thread_id - start_input; + const size_t thread_index = j * stride + thread_id - start_input; - // Mark an extra thread that will write head and tail - is_extra_thread = thread_index == vec_size + extra - 1; + // Mark an extra thread that will write head and tail + is_extra_thread = thread_index == vec_size + extra - 1; - if(thread_index < vec_size) - { - vec_data[thread_index] = *reinterpret_cast(output); + if(thread_index < vec_size) + { + vec_data[thread_index] = *reinterpret_cast(output); + } } - } #if !defined(__HIP_DEVICE_COMPILE__) } #endif @@ -400,30 +406,31 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, if constexpr(output_width 
> 1) { #if !defined(__HIP_DEVICE_COMPILE__) -#pragma unroll + #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { auto& output = outputs[warp_lane]; #endif - // Save head and tail, output was generated earlier - if(is_extra_thread) - { - for(unsigned int o = 0; o < output_width; o++) + // Save head and tail, output was generated earlier + if(is_extra_thread) { - if(o < head_size) - { - data[o] = output[o]; - } - if(o > output_width - tail_size - 1) + for(unsigned int o = 0; o < output_width; o++) { - data[size + (output_width - tail_size - 1) - o] = output[o]; + if(o < head_size) + { + data[o] = output[o]; + } + if(o > output_width - tail_size - 1) + { + data[size + (output_width - tail_size - 1) - o] = output[o]; + } } } - } #if !defined(__HIP_DEVICE_COMPILE__) } #endif } + // clang-format on } template @@ -442,9 +449,9 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, Distribution distribution) { #if !defined(__HIP_DEVICE_COMPILE__) - if (thread_idx.x % 8 != 0) + if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); @@ -467,8 +474,8 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, unsigned int input[input_width]; T output[output_width]; #else - unsigned int inputs[8][input_width]; - T outputs[8][output_width]; + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; #endif // Workaround: since load() and store() use the same indices, the compiler decides to keep @@ -480,10 +487,10 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, mt19937_octo_engine engine = accessor.load(block_idx.x * block_dim.x + thread_idx.x); #else mt19937_octo_engine thread_engines[8]; -#pragma unroll - for (size_t i = 0; i < 8; ++i) + #pragma unroll + for(size_t i = 0; i < 8; ++i) { - thread_engines[i] = accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); + thread_engines[i] = 
accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); } #endif @@ -491,78 +498,76 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, // Start sequence: at least some engines have values, generated by the previous call for // the end sequence, but not yet used. + // clang-format off if(start_input > 0) { #if !defined(__HIP_DEVICE_COMPILE__) #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { - auto& input = inputs[warp_lane]; - auto& output = outputs[warp_lane]; - mt19937_octo_engine& engine = thread_engines[warp_lane]; - const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll - for(unsigned int j = 0; j < inputs_per_state; j++) - { - // Skip used values - if(j * stride + thread_id >= start_input) + for(unsigned int j = 0; j < inputs_per_state; j++) { -#pragma unroll - for(unsigned int i = 0; i < input_width; i++) + // Skip used values + if(j * stride + thread_id >= start_input) { - input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); - } +#pragma unroll + for(unsigned int i = 0; i < input_width; i++) + { + input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); + } - distribution(input, output); + distribution(input, output); - const size_t thread_index = j * stride + thread_id - start_input; - vec_data[thread_index] = *reinterpret_cast(output); + const size_t thread_index = j * stride + thread_id - start_input; + vec_data[thread_index] = *reinterpret_cast(output); + } } - } #if !defined(__HIP_DEVICE_COMPILE__) } #endif base_index = full_stride - start_input; } + // clang-format on // Middle sequence: all engines write n * stride values together and use them all // in a fast unrolled loop. 
+ // clang-format off for(; base_index + full_stride <= vec_size; base_index += full_stride) { -// #if defined(__HIP_DEVICE_COMPILE__) -// engine.gen_next_n(); -// #else -// mt19937_octo_engine::gen_next_n(thread_engines); -// #endif - #if !defined(__HIP_DEVICE_COMPILE__) #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { - auto& input = inputs[warp_lane]; - auto& output = outputs[warp_lane]; - mt19937_octo_engine& engine = thread_engines[warp_lane]; - const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll - for(unsigned int j = 0; j < inputs_per_state; j++) - { -#pragma unroll - for(unsigned int i = 0; i < input_width; i++) + for(unsigned int j = 0; j < inputs_per_state; j++) { - input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); - } +#pragma unroll + for(unsigned int i = 0; i < input_width; i++) + { + input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); + } - distribution(input, output); + distribution(input, output); - const size_t thread_index = base_index + j * stride + thread_id; - vec_data[thread_index] = *reinterpret_cast(output); - } + const size_t thread_index = base_index + j * stride + thread_id; + vec_data[thread_index] = *reinterpret_cast(output); + } #if !defined(__HIP_DEVICE_COMPILE__) } #endif } + // clang-format on // Generate one extra VecT if data is not aligned by sizeof(VecT) or // size % output_width != 0 @@ -572,75 +577,77 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, if(base_index < vec_size + extra) { bool is_extra_thread = false; -// #if defined(__HIP_DEVICE_COMPILE__) -// engine.gen_next_n(); -// #else -// mt19937_octo_engine::gen_next_n(thread_engines); -// #endif + // #if 
defined(__HIP_DEVICE_COMPILE__) + // engine.gen_next_n(); + // #else + // mt19937_octo_engine::gen_next_n(thread_engines); + // #endif + // clang-format off #if !defined(__HIP_DEVICE_COMPILE__) #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { - auto& input = inputs[warp_lane]; - auto& output = outputs[warp_lane]; - mt19937_octo_engine& engine = thread_engines[warp_lane]; - const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; + auto& input = inputs[warp_lane]; + auto& output = outputs[warp_lane]; + mt19937_octo_engine& engine = thread_engines[warp_lane]; + const unsigned int thread_id = block_idx.x * block_size + thread_idx.x + warp_lane; #endif #pragma unroll - for(unsigned int j = 0; j < inputs_per_state; j++) - { -#pragma unroll - for(unsigned int i = 0; i < input_width; i++) + for(unsigned int j = 0; j < inputs_per_state; j++) { - input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); - } +#pragma unroll + for(unsigned int i = 0; i < input_width; i++) + { + input[i] = mt19937_octo_engine::temper(engine.get(j * input_width + i)); + } - distribution(input, output); + distribution(input, output); - const size_t thread_index = base_index + j * stride + thread_id; + const size_t thread_index = base_index + j * stride + thread_id; - // Mark an extra thread that will write head and tail - is_extra_thread = thread_index == vec_size + extra - 1; + // Mark an extra thread that will write head and tail + is_extra_thread = thread_index == vec_size + extra - 1; - if(thread_index < vec_size) - { - vec_data[thread_index] = *reinterpret_cast(output); - } - else - { - break; + if(thread_index < vec_size) + { + vec_data[thread_index] = *reinterpret_cast(output); + } + else + { + break; + } } - } - if constexpr(output_width > 1) - { - // Save head and tail, output was generated earlier - if(is_extra_thread) + if constexpr(output_width > 1) { - for(unsigned int o = 0; o < output_width; o++) + // Save head 
and tail, output was generated earlier + if(is_extra_thread) { - if(o < head_size) - { - data[o] = output[o]; - } - if(o > output_width - tail_size - 1) + for(unsigned int o = 0; o < output_width; o++) { - data[size + (output_width - tail_size - 1) - o] = output[o]; + if(o < head_size) + { + data[o] = output[o]; + } + if(o > output_width - tail_size - 1) + { + data[size + (output_width - tail_size - 1) - o] = output[o]; + } } } } - } #if !defined(__HIP_DEVICE_COMPILE__) } #endif + // clang-format on } // save state #if defined(__HIP_DEVICE_COMPILE__) accessor.save(thread_id, engine); #else - for (size_t i = 0; i < 8; ++i) + for(size_t i = 0; i < 8; ++i) { accessor.save(block_idx.x * block_dim.x + thread_idx.x + i, thread_engines[i]); } @@ -810,9 +817,9 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base } status = system_type::memcpy(d_mt19937_jump, - rocrand_h_mt19937_jump, - sizeof(rocrand_h_mt19937_jump), - hipMemcpyHostToDevice); + rocrand_h_mt19937_jump, + sizeof(rocrand_h_mt19937_jump), + hipMemcpyHostToDevice); if(status != ROCRAND_STATUS_SUCCESS) { system_type::free(d_engines); @@ -825,7 +832,8 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base [&, this](auto is_dynamic) { status = system_type::template launch< - rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail:: + jump_ahead_mt19937, rocrand_host::detail::static_block_size_config_provider< jump_ahead_thread_count>>(dim3(m_generator_count), dim3(jump_ahead_thread_count), diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index 0d75ab784..9af6f87a9 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -136,7 +136,8 @@ struct mt19937_octo_engine static constexpr inline unsigned int i568 = 1 + items_per_thread * 10; /// Initialize the octo engine from the engine it shares with seven other threads. 
- __forceinline__ __device__ void gather(const unsigned int engine[mt19937_constants::n], dim3 thread_idx) + __forceinline__ __device__ void gather(const unsigned int engine[mt19937_constants::n], + dim3 thread_idx) { constexpr unsigned int off_cnt = 11; /// Used to map the \p mt19937_octo_state.mt indices to \p mt19937_state.mt indices. @@ -182,7 +183,7 @@ struct mt19937_octo_engine return __shfl_up(val, 1, 8); } /// Calculates value of index \p i using values i, (i + 1) % n, and (i + m) % n. - static __forceinline__ __device__ unsigned int + static __forceinline__ __device__ __host__ unsigned int comp(unsigned int mt_i, unsigned int mt_i_1, unsigned int mt_i_m) { const unsigned int y @@ -219,19 +220,19 @@ struct mt19937_octo_engine m_state.mt[idx_i + j] = comp(m_state.mt[idx_i + j], last_dep, m_state.mt[idx_m + j]); } - static void comp_vector(unsigned int idx_i, - unsigned int idx_m, - unsigned int last_dep_tid_7, + static void comp_vector(unsigned int idx_i, + unsigned int idx_m, + unsigned int last_dep_tid_7, mt19937_octo_engine thread_engines[8]) { // communicate the dependency for the last value unsigned int last_deps[8]; - for (int i = 0; i < 8; ++i) + for(int i = 0; i < 8; ++i) { last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[idx_i]; } - for (int i = 0; i < 8; ++i) + for(int i = 0; i < 8; ++i) { // thread 7 needs a special value that does not fit the pattern unsigned int last_dep = i == 7 ? 
last_dep_tid_7 : last_deps[i]; @@ -241,10 +242,14 @@ struct mt19937_octo_engine { // compute (i + ipt * i + j)': needs (i + ipt * i + 1 + j) % n and (i + ipt * i + m + j) % n thread_engines[i].m_state.mt[idx_i + j] - = comp(thread_engines[i].m_state.mt[idx_i + j], thread_engines[i].m_state.mt[idx_i + j + 1], thread_engines[i].m_state.mt[idx_m + j]); + = comp(thread_engines[i].m_state.mt[idx_i + j], + thread_engines[i].m_state.mt[idx_i + j + 1], + thread_engines[i].m_state.mt[idx_m + j]); } // compute the last value using the communicated dependency - thread_engines[i].m_state.mt[idx_i + j] = comp(thread_engines[i].m_state.mt[idx_i + j], last_dep, thread_engines[i].m_state.mt[idx_m + j]); + thread_engines[i].m_state.mt[idx_i + j] = comp(thread_engines[i].m_state.mt[idx_i + j], + last_dep, + thread_engines[i].m_state.mt[idx_m + j]); } } @@ -404,7 +409,8 @@ struct mt19937_octo_engine // compute 0': needs 1 and 397 const unsigned int v397 = thread_engines[5].m_state.mt[i397_5]; - thread_engines[0].m_state.mt[i000_0] = comp(thread_engines[0].m_state.mt[i000_0], thread_engines[0].m_state.mt[i001], v397); + thread_engines[0].m_state.mt[i000_0] + = comp(thread_engines[0].m_state.mt[i000_0], thread_engines[0].m_state.mt[i001], v397); // compute [ 1 + i * ipt, 1 + ipt * (i + 1))' = [ 1, 56]': // needs [ 1, 57] and [398, 453] @@ -419,7 +425,8 @@ struct mt19937_octo_engine // compute 113': needs 114 and 510 const unsigned int v114 = thread_engines[0].m_state.mt[i114]; const unsigned int v510 = thread_engines[6].m_state.mt[i510_6]; - thread_engines[1].m_state.mt[i113_1] = comp(thread_engines[1].m_state.mt[i113_1], v114, v510); + thread_engines[1].m_state.mt[i113_1] + = comp(thread_engines[1].m_state.mt[i113_1], v114, v510); // compute [114 + i * ipt, 114 + ipt * (i + 1))' = [114, 169]': // needs [114, 170] and [511, 566] @@ -429,7 +436,8 @@ struct mt19937_octo_engine // compute 170': needs 171 and 567 const unsigned int v171 = thread_engines[0].m_state.mt[i171]; const unsigned 
int v567 = thread_engines[7].m_state.mt[i567_7]; - thread_engines[2].m_state.mt[i170_2] = comp(thread_engines[2].m_state.mt[i170_2], v171, v567); + thread_engines[2].m_state.mt[i170_2] + = comp(thread_engines[2].m_state.mt[i170_2], v171, v567); // compute [171 + i * ipt, 171 + ipt * (i + 1))' = [171, 226]': // needs [171, 227] and [568, 623] @@ -445,35 +453,43 @@ struct mt19937_octo_engine unsigned int last_deps[8]; unsigned int first_deps[8]; - for (int i = 0; i < 8; ++i) + for(int i = 0; i < 8; ++i) { - last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[i227]; + last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[i227]; first_deps[i] = thread_engines[(i - 1) % 8].m_state.mt[i001 + items_per_thread - 1]; } - for (int i = 0; i < 8; ++i) + for(int i = 0; i < 8; ++i) { // communicate the dependency for the first and last value - unsigned int last_dep = i == 7 ? v283 : last_dep; - unsigned int first_dep = i == 0 ? thread_engines[0].m_state.mt[i000_0] : first_dep; + unsigned int last_dep = i == 7 ? v283 : last_deps[i]; + unsigned int first_dep = i == 0 ? 
thread_engines[0].m_state.mt[i000_0] : first_deps[i]; // extract the first and last iterations from the loop - unsigned int j = 0; - thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], thread_engines[i].m_state.mt[i227 + j + 1], first_dep); + unsigned int j = 0; + thread_engines[i].m_state.mt[i227 + j] + = comp(thread_engines[i].m_state.mt[i227 + j], + thread_engines[i].m_state.mt[i227 + j + 1], + first_dep); for(j = 1; j < items_per_thread - 1; j++) { - thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], - thread_engines[i].m_state.mt[i227 + j + 1], - thread_engines[i].m_state.mt[i001 + j - 1]); + thread_engines[i].m_state.mt[i227 + j] + = comp(thread_engines[i].m_state.mt[i227 + j], + thread_engines[i].m_state.mt[i227 + j + 1], + thread_engines[i].m_state.mt[i001 + j - 1]); } - thread_engines[i].m_state.mt[i227 + j] = comp(thread_engines[i].m_state.mt[i227 + j], last_dep, thread_engines[i].m_state.mt[i001 + j - 1]); + thread_engines[i].m_state.mt[i227 + j] + = comp(thread_engines[i].m_state.mt[i227 + j], + last_dep, + thread_engines[i].m_state.mt[i001 + j - 1]); } } // compute 283': needs 284 and 56' const unsigned int v284 = thread_engines[0].m_state.mt[i284]; const unsigned int v056 = thread_engines[7].m_state.mt[i001 + 6]; // 1 + 7 * 7 + 6 = 56 - thread_engines[3].m_state.mt[i283_3] = comp(thread_engines[3].m_state.mt[i283_3], v284, v056); + thread_engines[3].m_state.mt[i283_3] + = comp(thread_engines[3].m_state.mt[i283_3], v284, v056); // compute [284 + i * ipt, 284 + ipt * (i + 1))' = [284, 339]': // needs [284, 340] and [ 57, 112]' @@ -483,7 +499,8 @@ struct mt19937_octo_engine // compute 340': needs 341 and 113' const unsigned int v113_ = thread_engines[1].m_state.mt[i113_1]; const unsigned int v341 = thread_engines[0].m_state.mt[i341]; - thread_engines[4].m_state.mt[i340_4] = comp(thread_engines[4].m_state.mt[i340_4], v341, v113_); + thread_engines[4].m_state.mt[i340_4] + = 
comp(thread_engines[4].m_state.mt[i340_4], v341, v113_); // compute [341 + i * ipt, 341 + ipt * (i + 1))' = [341, 396]': // needs [341, 397] and [114, 169]' @@ -493,7 +510,8 @@ struct mt19937_octo_engine // compute 397': needs 398 and 170' const unsigned int v398 = thread_engines[0].m_state.mt[i398]; const unsigned int v170_ = thread_engines[2].m_state.mt[i170_2]; - thread_engines[5].m_state.mt[i397_5] = comp(thread_engines[5].m_state.mt[i397_5], v398, v170_); + thread_engines[5].m_state.mt[i397_5] + = comp(thread_engines[5].m_state.mt[i397_5], v398, v170_); // compute [398 + i * ipt, 398 + ipt * (i + 1))' = [398, 453]': // needs [398, 454] and [171, 226]' @@ -508,7 +526,8 @@ struct mt19937_octo_engine // compute 510': needs 511 and 283' const unsigned int v511 = thread_engines[0].m_state.mt[i511]; const unsigned int v283_ = thread_engines[3].m_state.mt[i283_3]; - thread_engines[6].m_state.mt[i510_6] = comp(thread_engines[6].m_state.mt[i510_6], v511, v283_); + thread_engines[6].m_state.mt[i510_6] + = comp(thread_engines[6].m_state.mt[i510_6], v511, v283_); // compute [511 + i * ipt, 511 + ipt * (i + 1))' = [511, 566]': // needs [511, 567] and [284, 339]' @@ -518,7 +537,8 @@ struct mt19937_octo_engine // compute 567': needs 568 and 340' const unsigned int v568 = thread_engines[0].m_state.mt[i568]; const unsigned int i340 = thread_engines[4].m_state.mt[i340_4]; - thread_engines[7].m_state.mt[i567_7] = comp(thread_engines[7].m_state.mt[i567_7], v568, i340); + thread_engines[7].m_state.mt[i567_7] + = comp(thread_engines[7].m_state.mt[i567_7], v568, i340); // compute [568 + i * ipt, 568 + ipt * (i + 1))' = [568, 623]': // needs [568, 623], [0, 0]', and [341, 396]' diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 40090f009..34e74a9cc 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -77,7 +77,7 @@ struct host_system delete[] ptr; } - static rocrand_status memcpy(void * dst, const void * src, size_t size, hipMemcpyKind 
/*kind*/) + static rocrand_status memcpy(void* dst, const void* src, size_t size, hipMemcpyKind /*kind*/) { std::memcpy(dst, src, size); return ROCRAND_STATUS_SUCCESS; @@ -159,10 +159,10 @@ struct host_system if constexpr(UseHostFunc) { - hipError_t status = hipLaunchHostFunc(stream, kernel_callback, kernel_args); - hipStreamSynchronize(stream); + hipError_t status = hipLaunchHostFunc(stream, kernel_callback, kernel_args); + hipError_t sync_status = hipStreamSynchronize(stream); - if(status != hipSuccess) + if(status != hipSuccess || sync_status != hipSuccess) { // At this point, if the callback has not been invoked, there will be a memory // leak. It is unclear whether hipLaunchHostFunc can return an error after the @@ -216,7 +216,7 @@ struct device_system ROCRAND_HIP_FATAL_ASSERT(hipFree(ptr)); } - static rocrand_status memcpy(void * dst, const void * src, size_t size, hipMemcpyKind kind) + static rocrand_status memcpy(void* dst, const void* src, size_t size, hipMemcpyKind kind) { hipError_t error = hipMemcpy(dst, src, size, kind); if(error != hipSuccess) diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index 9ed14886a..f21264ac2 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -739,15 +739,22 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) using ConfigProvider = default_config_provider; rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, - rocrand_host::detail::static_block_size_config_provider>( - dim3(generator_count), - dim3(generator_t::jump_ahead_thread_count), - 0, - 0, - d_engines, - seed, - d_mt19937_jump); + rocrand_host::detail:: + jump_ahead_mt19937, + rocrand_host::detail::static_block_size_config_provider< + generator_t::jump_ahead_thread_count>>(dim3(generator_count), + dim3(generator_t::jump_ahead_thread_count), + 0, + 0, + d_engines, + seed, + d_mt19937_jump); + 
if(status != ROCRAND_STATUS_SUCCESS) + { + std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" + << std::endl; + exit(status); + } octo_engine_type* d_octo_engines{}; HIP_CHECK(hipMalloc(&d_octo_engines, @@ -1153,8 +1160,11 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) [&](auto is_dynamic) { rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, - rocrand_host::detail::static_block_size_config_provider>( + rocrand_host::detail::jump_ahead_mt19937, + rocrand_host::detail::static_block_size_config_provider< + generator_t::jump_ahead_thread_count>>( dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), 0, @@ -1162,6 +1172,12 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) d_engines1, seed, d_mt19937_jump); + if(status != ROCRAND_STATUS_SUCCESS) + { + std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" + << std::endl; + exit(status); + } }); std::vector h_engines1(generator_count * n); From f6c4ba97076a1341c188dd0ce38b857a4de7e054 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Mon, 8 Apr 2024 11:34:19 +0000 Subject: [PATCH 23/54] fix missing gen_next_n calls fix clang-format issues --- library/src/rng/mt19937.hpp | 31 +++++++++++++-------- library/src/rng/mt19937_octo_engine.hpp | 5 ++-- test/internal/test_rocrand_mt19937_prng.cpp | 4 +-- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index e18aa2499..ad186beff 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -344,13 +344,15 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; #endif + // clang-format off #if defined(__HIP_DEVICE_COMPILE__) unsigned int input[input_width]; T output[output_width]; #else - unsigned int inputs[8][input_width]; - T 
outputs[8][output_width]; + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; #endif + // clang-format on // Generate one extra VecT if data is not aligned by sizeof(VecT) or // size % output_width != 0 @@ -469,30 +471,34 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, constexpr unsigned int stride = block_size * grid_size; constexpr unsigned int full_stride = stride * inputs_per_state; + // clang-format off #if defined(__HIP_DEVICE_COMPILE__) const unsigned int thread_id = block_idx.x * block_size + thread_idx.x; unsigned int input[input_width]; T output[output_width]; #else - unsigned int inputs[8][input_width]; - T outputs[8][output_width]; + unsigned int inputs[8][input_width]; + T outputs[8][output_width]; #endif + // clang-format on // Workaround: since load() and store() use the same indices, the compiler decides to keep // computed addresses alive wasting 78 * 2 VGPRs. block_dim.x equals to block_size but it is // a runtime value so save() will compute new addresses. 
mt19937_octo_engine_accessor accessor(engines); + // clang-format off #if defined(__HIP_DEVICE_COMPILE__) mt19937_octo_engine engine = accessor.load(block_idx.x * block_dim.x + thread_idx.x); #else mt19937_octo_engine thread_engines[8]; - #pragma unroll +#pragma unroll for(size_t i = 0; i < 8; ++i) { thread_engines[i] = accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); } #endif + // clang-format on size_t base_index = 0; @@ -540,7 +546,10 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, // clang-format off for(; base_index + full_stride <= vec_size; base_index += full_stride) { -#if !defined(__HIP_DEVICE_COMPILE__) +#if defined(__HIP_DEVICE_COMPILE__) + engine.gen_next_n(); +#else + mt19937_octo_engine::gen_next_n(thread_engines); #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { @@ -577,11 +586,11 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, if(base_index < vec_size + extra) { bool is_extra_thread = false; - // #if defined(__HIP_DEVICE_COMPILE__) - // engine.gen_next_n(); - // #else - // mt19937_octo_engine::gen_next_n(thread_engines); - // #endif +#if defined(__HIP_DEVICE_COMPILE__) + engine.gen_next_n(); +#else + mt19937_octo_engine::gen_next_n(thread_engines); +#endif // clang-format off #if !defined(__HIP_DEVICE_COMPILE__) diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index 9af6f87a9..31bdfbcbb 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -462,8 +462,9 @@ struct mt19937_octo_engine for(int i = 0; i < 8; ++i) { // communicate the dependency for the first and last value - unsigned int last_dep = i == 7 ? v283 : last_deps[i]; - unsigned int first_dep = i == 0 ? thread_engines[0].m_state.mt[i000_0] : first_deps[i]; + unsigned int last_dep = i == 7 ? v283 : last_deps[i]; + unsigned int first_dep + = i == 0 ? 
thread_engines[0].m_state.mt[i000_0] : first_deps[i]; // extract the first and last iterations from the loop unsigned int j = 0; diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index f21264ac2..dff7c2384 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -752,7 +752,7 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) if(status != ROCRAND_STATUS_SUCCESS) { std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" - << std::endl; + << std::endl; exit(status); } @@ -1175,7 +1175,7 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) if(status != ROCRAND_STATUS_SUCCESS) { std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" - << std::endl; + << std::endl; exit(status); } }); From d922943c36cfe4c4ed17231196934f09b6353a1e Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 11 Apr 2024 07:32:56 +0000 Subject: [PATCH 24/54] fix format issues and missing __host__s --- library/src/rng/mt19937.hpp | 4 +++- library/src/rng/mt19937_octo_engine.hpp | 14 +++++++------- test/internal/test_rocrand_mt19937_prng.cpp | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index ad186beff..95f105958 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -586,13 +586,13 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, if(base_index < vec_size + extra) { bool is_extra_thread = false; + // clang-format off #if defined(__HIP_DEVICE_COMPILE__) engine.gen_next_n(); #else mt19937_octo_engine::gen_next_n(thread_engines); #endif - // clang-format off #if !defined(__HIP_DEVICE_COMPILE__) #pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) @@ -653,6 +653,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, } // save state + // 
clang-format off #if defined(__HIP_DEVICE_COMPILE__) accessor.save(thread_id, engine); #else @@ -661,6 +662,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, accessor.save(block_idx.x * block_dim.x + thread_idx.x + i, thread_engines[i]); } #endif + // clang-format on } } // end namespace rocrand_host::detail diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index 31bdfbcbb..74a5c8ede 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -136,7 +136,7 @@ struct mt19937_octo_engine static constexpr inline unsigned int i568 = 1 + items_per_thread * 10; /// Initialize the octo engine from the engine it shares with seven other threads. - __forceinline__ __device__ void gather(const unsigned int engine[mt19937_constants::n], + __forceinline__ __device__ __host__ void gather(const unsigned int engine[mt19937_constants::n], dim3 thread_idx) { constexpr unsigned int off_cnt = 11; @@ -548,13 +548,13 @@ struct mt19937_octo_engine } /// Return \p i state value without tempering - __forceinline__ __device__ unsigned int get(unsigned int i) const + __forceinline__ __device__ __host__ unsigned int get(unsigned int i) const { return m_state.mt[i]; } /// Perform tempering on y - static __forceinline__ __device__ unsigned int temper(unsigned int y) + static __forceinline__ __device__ __host__ unsigned int temper(unsigned int y) { constexpr unsigned int TEMPERING_MASK_B = 0x9D2C5680U; constexpr unsigned int TEMPERING_MASK_C = 0xEFC60000U; @@ -574,19 +574,19 @@ struct mt19937_octo_engine template struct mt19937_octo_engine_accessor { - __forceinline__ __device__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) + __forceinline__ __device__ __host__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) : engines(_engines) {} /// Load one value \p i of the octo engine \p engine_id from global memory with coalesced /// access - __forceinline__ 
__device__ unsigned int load_value(unsigned int engine_id, unsigned int i) const + __forceinline__ __device__ __host__ unsigned int load_value(unsigned int engine_id, unsigned int i) const { return engines[i * stride + engine_id]; } /// Load the octo engine from global memory with coalesced access - __forceinline__ __device__ mt19937_octo_engine load(unsigned int engine_id) const + __forceinline__ __device__ __host__ mt19937_octo_engine load(unsigned int engine_id) const { mt19937_octo_engine engine; #pragma unroll @@ -598,7 +598,7 @@ struct mt19937_octo_engine_accessor } /// Save the octo engine to global memory with coalesced access - __forceinline__ __device__ void save(unsigned int engine_id, + __forceinline__ __device__ __host__ void save(unsigned int engine_id, const mt19937_octo_engine& engine) const { #pragma unroll diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index dff7c2384..010bdb110 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -1174,8 +1174,8 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) d_mt19937_jump); if(status != ROCRAND_STATUS_SUCCESS) { - std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" - << std::endl; + std::cout << "rocRAND error code: " << status + << " while calling: jump_ahead_mt19937" << std::endl; exit(status); } }); From 961123f7c4e0d8ed1cea2f8eb03dd3b8202bfeef Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Mon, 15 Apr 2024 12:55:40 +0000 Subject: [PATCH 25/54] fix messed up host/device allocations --- library/src/rng/mt19937.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 95f105958..6a145fdbb 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -1087,7 +1087,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base unsigned long long 
m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager<> m_poisson; + poisson_distribution_manager::value> m_poisson; /// Number of independent generators. Value changes generated number stream. unsigned int m_generator_count = 0; From f718fe98d463f536833806a0909f1767727113bf Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 18 Apr 2024 06:40:36 +0000 Subject: [PATCH 26/54] fix merge conflicts fix format --- library/src/rng/mt19937.hpp | 37 +++++++++++---------- library/src/rng/mt19937_octo_engine.hpp | 10 +++--- library/src/rocrand.cpp | 2 +- test/internal/test_rocrand_mt19937_prng.cpp | 14 ++++---- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 6a145fdbb..0b6404450 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -665,13 +665,11 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, // clang-format on } -} // end namespace rocrand_host::detail - template -class rocrand_mt19937_template : public rocrand_generator_impl_base +class mt19937_generator_template : public generator_impl_base { public: - using base_type = rocrand_generator_impl_base; + using base_type = generator_impl_base; using octo_engine_type = mt19937_octo_engine; using system_type = System; @@ -804,7 +802,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base // Allocate device random number engines rocrand_status status = system_type::alloc( &m_engines, - m_generator_count * rocrand_host::detail::mt19937_constants::n * sizeof(unsigned int)); + m_generator_count * mt19937_constants::n * sizeof(unsigned int)); if(status != ROCRAND_STATUS_SUCCESS) { return status; @@ -812,7 +810,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base unsigned int* d_engines{}; status = system_type::alloc(&d_engines, - m_generator_count * rocrand_host::detail::mt19937_constants::n + 
m_generator_count * mt19937_constants::n * sizeof(unsigned int)); if(status != ROCRAND_STATUS_SUCCESS) { @@ -843,9 +841,9 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base [&, this](auto is_dynamic) { status = system_type::template launch< - rocrand_host::detail:: + jump_ahead_mt19937, - rocrand_host::detail::static_block_size_config_provider< + static_block_size_config_provider< jump_ahead_thread_count>>(dim3(m_generator_count), dim3(jump_ahead_thread_count), 0, @@ -864,12 +862,12 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base system_type::free(d_mt19937_jump); // This kernel is not actually tuned for ordering, but config is needed for device-side compile time check of the generator count - rocrand_host::detail::dynamic_dispatch( + dynamic_dispatch( m_order, [&, this](auto is_dynamic) { status = system_type::template launch< - rocrand_host::detail::init_engines_mt19937>( + init_engines_mt19937>( dim3(config.blocks), dim3(config.threads), 0, @@ -915,7 +913,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } - using vec_type = aligned_vec_type; + using vec_type = rocrand_impl::aligned_vec_type; const uintptr_t uintptr = reinterpret_cast(data); const size_t misalignment @@ -960,7 +958,7 @@ class rocrand_mt19937_template : public rocrand_generator_impl_base [&, this](auto is_dynamic) { status = system_type::template launch< - rocrand_host::detail::generate_short_mt19937::value> m_poisson; + poisson_distribution_manager::value> + m_poisson; /// Number of independent generators. Value changes generated number stream. 
unsigned int m_generator_count = 0; }; -using rocrand_mt19937 = rocrand_mt19937_template< - rocrand_system_device, +using mt19937_generator = mt19937_generator_template< + rocrand_impl::system::device_system, default_config_provider>; -using rocrand_mt19937_host = rocrand_mt19937_template< - rocrand_system_host, +template +using mt19937_generator_host = mt19937_generator_template< + rocrand_impl::system::host_system, default_config_provider>; } // namespace rocrand_impl::host diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index 74a5c8ede..cd32eb088 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -137,7 +137,7 @@ struct mt19937_octo_engine /// Initialize the octo engine from the engine it shares with seven other threads. __forceinline__ __device__ __host__ void gather(const unsigned int engine[mt19937_constants::n], - dim3 thread_idx) + dim3 thread_idx) { constexpr unsigned int off_cnt = 11; /// Used to map the \p mt19937_octo_state.mt indices to \p mt19937_state.mt indices. 
@@ -574,13 +574,15 @@ struct mt19937_octo_engine template struct mt19937_octo_engine_accessor { - __forceinline__ __device__ __host__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) + __forceinline__ + __device__ __host__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) : engines(_engines) {} /// Load one value \p i of the octo engine \p engine_id from global memory with coalesced /// access - __forceinline__ __device__ __host__ unsigned int load_value(unsigned int engine_id, unsigned int i) const + __forceinline__ __device__ __host__ unsigned int load_value(unsigned int engine_id, + unsigned int i) const { return engines[i * stride + engine_id]; } @@ -599,7 +601,7 @@ struct mt19937_octo_engine_accessor /// Save the octo engine to global memory with coalesced access __forceinline__ __device__ __host__ void save(unsigned int engine_id, - const mt19937_octo_engine& engine) const + const mt19937_octo_engine& engine) const { #pragma unroll for(unsigned int i = 0; i < mt19937_constants::n / threads_per_generator; i++) diff --git a/library/src/rocrand.cpp b/library/src/rocrand.cpp index 81e77d506..7c29f5a29 100644 --- a/library/src/rocrand.cpp +++ b/library/src/rocrand.cpp @@ -80,7 +80,7 @@ rocrand_status create_generator_host(rocrand_generator* generator, rocrand_rng_t *generator = new generator_type>(); break; case ROCRAND_RNG_PSEUDO_MT19937: - *generator = new rocrand_generator_type>(); + *generator = new generator_type>(); break; default: return ROCRAND_STATUS_TYPE_ERROR; diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index 010bdb110..dd364fdcc 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -738,10 +738,10 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) // dummy config provider, kernel just needs to verify the amount of generators for the actual call using ConfigProvider = default_config_provider; - 
rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail:: + rocrand_status status = rocrand_impl::system::device_system::template launch< + rocrand_impl::host:: jump_ahead_mt19937, - rocrand_host::detail::static_block_size_config_provider< + rocrand_impl::host::static_block_size_config_provider< generator_t::jump_ahead_thread_count>>(dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), 0, @@ -1155,15 +1155,15 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) unsigned int* d_engines1{}; HIP_CHECK(hipMalloc(&d_engines1, generator_count * n * sizeof(unsigned int))); - rocrand_host::detail::dynamic_dispatch( + rocrand_impl::host::dynamic_dispatch( ROCRAND_ORDERING_PSEUDO_DEFAULT, [&](auto is_dynamic) { - rocrand_status status = rocrand_system_device::template launch< - rocrand_host::detail::jump_ahead_mt19937, - rocrand_host::detail::static_block_size_config_provider< + rocrand_impl::host::static_block_size_config_provider< generator_t::jump_ahead_thread_count>>( dim3(generator_count), dim3(generator_t::jump_ahead_thread_count), From 1188c63f7e47754a318c3f8030351d27b306c49c Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Fri, 19 Apr 2024 12:47:23 +0000 Subject: [PATCH 27/54] fix format issues and compile error --- benchmark/tuning/benchmarked_generators.hpp | 3 +- library/src/rng/mt19937.hpp | 91 ++++++++++----------- test/internal/test_rocrand_mt19937_prng.cpp | 4 +- 3 files changed, 48 insertions(+), 50 deletions(-) diff --git a/benchmark/tuning/benchmarked_generators.hpp b/benchmark/tuning/benchmarked_generators.hpp index f50133d0f..69b70c0a5 100644 --- a/benchmark/tuning/benchmarked_generators.hpp +++ b/benchmark/tuning/benchmarked_generators.hpp @@ -107,7 +107,8 @@ using mtgp32_generator_template ConfigProvider>; template -using mt19937_generator_template = rocrand_impl::host::mt19937_generator_template; +using mt19937_generator_template + = rocrand_impl::host::mt19937_generator_template; template using 
philox4x32_10_generator_template diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 0b6404450..e529b46b2 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -800,18 +800,18 @@ class mt19937_generator_template : public generator_impl_base system_type::free(m_engines); } // Allocate device random number engines - rocrand_status status = system_type::alloc( - &m_engines, - m_generator_count * mt19937_constants::n * sizeof(unsigned int)); + rocrand_status status + = system_type::alloc(&m_engines, + m_generator_count * mt19937_constants::n * sizeof(unsigned int)); if(status != ROCRAND_STATUS_SUCCESS) { return status; } unsigned int* d_engines{}; - status = system_type::alloc(&d_engines, - m_generator_count * mt19937_constants::n - * sizeof(unsigned int)); + status + = system_type::alloc(&d_engines, + m_generator_count * mt19937_constants::n * sizeof(unsigned int)); if(status != ROCRAND_STATUS_SUCCESS) { return status; @@ -841,16 +841,16 @@ class mt19937_generator_template : public generator_impl_base [&, this](auto is_dynamic) { status = system_type::template launch< - - jump_ahead_mt19937, - static_block_size_config_provider< - jump_ahead_thread_count>>(dim3(m_generator_count), - dim3(jump_ahead_thread_count), - 0, - m_stream, - d_engines, - m_seed, - d_mt19937_jump); + + jump_ahead_mt19937, + static_block_size_config_provider>( + dim3(m_generator_count), + dim3(jump_ahead_thread_count), + 0, + m_stream, + d_engines, + m_seed, + d_mt19937_jump); }); if(status != ROCRAND_STATUS_SUCCESS) { @@ -862,19 +862,18 @@ class mt19937_generator_template : public generator_impl_base system_type::free(d_mt19937_jump); // This kernel is not actually tuned for ordering, but config is needed for device-side compile time check of the generator count - dynamic_dispatch( - m_order, - [&, this](auto is_dynamic) - { - status = system_type::template launch< - init_engines_mt19937>( - dim3(config.blocks), - dim3(config.threads), - 0, - 
m_stream, - m_engines, - d_engines); - }); + dynamic_dispatch(m_order, + [&, this](auto is_dynamic) + { + status = system_type::template launch< + init_engines_mt19937>( + dim3(config.blocks), + dim3(config.threads), + 0, + m_stream, + m_engines, + d_engines); + }); if(status != ROCRAND_STATUS_SUCCESS) { system_type::free(d_engines); @@ -957,12 +956,11 @@ class mt19937_generator_template : public generator_impl_base m_order, [&, this](auto is_dynamic) { - status = system_type::template launch< - generate_short_mt19937>( + status = system_type::template launch>( dim3(config.blocks), dim3(config.threads), 0, @@ -989,12 +987,11 @@ class mt19937_generator_template : public generator_impl_base m_order, [&, this](auto is_dynamic) { - status = system_type::template launch< - generate_long_mt19937>( + status = system_type::template launch>( dim3(config.blocks), dim3(config.threads), 0, @@ -1093,13 +1090,13 @@ class mt19937_generator_template : public generator_impl_base unsigned int m_generator_count = 0; }; -using mt19937_generator = mt19937_generator_template< - rocrand_impl::system::device_system, - default_config_provider>; +using mt19937_generator + = mt19937_generator_template>; template -using mt19937_generator_host = mt19937_generator_template< - rocrand_impl::system::host_system, - default_config_provider>; +using mt19937_generator_host + = mt19937_generator_template, + default_config_provider>; } // namespace rocrand_impl::host diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index dd364fdcc..c3e3d60a4 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -1161,8 +1161,8 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) { rocrand_status status = rocrand_impl::system::device_system::template launch< rocrand_impl::host::jump_ahead_mt19937, + ConfigProvider, + is_dynamic>, rocrand_impl::host::static_block_size_config_provider< 
generator_t::jump_ahead_thread_count>>( dim3(generator_count), From 1abe891bc2a9914c30902045b3d420a7db9603ba Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 25 Apr 2024 11:30:32 +0000 Subject: [PATCH 28/54] fix format issues --- benchmark/tuning/benchmarked_generators.hpp | 3 ++- library/src/rng/mt19937.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmark/tuning/benchmarked_generators.hpp b/benchmark/tuning/benchmarked_generators.hpp index 69b70c0a5..855fcba46 100644 --- a/benchmark/tuning/benchmarked_generators.hpp +++ b/benchmark/tuning/benchmarked_generators.hpp @@ -108,7 +108,8 @@ using mtgp32_generator_template template using mt19937_generator_template - = rocrand_impl::host::mt19937_generator_template; + = rocrand_impl::host::mt19937_generator_template; template using philox4x32_10_generator_template diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index e529b46b2..300b8aaff 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -326,7 +326,7 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); @@ -453,7 +453,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); From 2f771bb4044296c2da3855d20c9ec797afc5d330 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Mon, 29 Apr 2024 10:41:45 +0000 Subject: [PATCH 29/54] fix format issue --- library/src/rng/mt19937.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 300b8aaff..f14834d76 100644 --- a/library/src/rng/mt19937.hpp +++ 
b/library/src/rng/mt19937.hpp @@ -56,8 +56,8 @@ #include "distributions.hpp" #include "generator_type.hpp" #include "mt19937_octo_engine.hpp" -#include "utils/cpp_utils.hpp" #include "system.hpp" +#include "utils/cpp_utils.hpp" #include "config/config_defaults.hpp" #include "config_types.hpp" @@ -326,7 +326,7 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); @@ -453,7 +453,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); From bb0fe50ec1833770e5909e29a9c6933d01b51b1f Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Fri, 3 May 2024 06:58:47 +0000 Subject: [PATCH 30/54] disable most mt19937 host tests for normal run (enabled for slow test run) --- library/src/rng/mt19937.hpp | 4 ++-- test/test_rocrand_host.cpp | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index f14834d76..94caaa12b 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -326,7 +326,7 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); @@ -453,7 +453,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) { - return; + return; } #endif constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); diff --git a/test/test_rocrand_host.cpp b/test/test_rocrand_host.cpp index 
bc5c53751..cb7d810a7 100644 --- a/test/test_rocrand_host.cpp +++ b/test/test_rocrand_host.cpp @@ -72,6 +72,11 @@ class rocrand_generate_host_test : public ::testing::TestWithParam& seeds = get_seeds()) { + if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) + { + ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + } + rocrand_generator device_generator, host_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); @@ -171,6 +181,11 @@ void test_uniform_parity(rocrand_rng_type rng_type, F generate, const std::vector& seeds = get_seeds()) { + if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) + { + ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + } + rocrand_generator device_generator, host_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); @@ -224,6 +239,11 @@ void test_normal_parity(rocrand_rng_type rng_type, double eps, const std::vector& seeds = get_seeds()) { + if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) + { + ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + } + Type mean = static_cast(-12.0); Type stddev = static_cast(2.4); From 5b786cb28772beb0617f60084037449d1c5517e5 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 9 May 2024 08:54:50 +0000 Subject: [PATCH 31/54] fix review comments --- CHANGELOG.md | 4 +++ benchmark/benchmark_rocrand_host_api.cpp | 8 ++---- library/include/rocrand/rocrand.h | 15 +--------- library/src/rng/generator_type_mt19937.cpp | 2 ++ library/src/rng/generator_types.hpp | 2 ++ library/src/rng/mt19937.hpp | 31 ++++++++------------- library/src/rng/mt19937_octo_engine.hpp | 17 +++++------ test/internal/test_rocrand_mt19937_prng.cpp | 14 ++-------- 8 files changed, 33 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38c576334..1761a550f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Documentation for rocRAND is available at ## 
(Unreleased) rocRAND-3.2.0 for ROCm 6.3.0 +### Additions + +* Added host generator for MT19937 + ### Changes * `rocrand_discrete` for MTGP32, LFSR113 and ThreeFry generators now uses the alias method, which is faster than binary search in CDF. diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index 8b2b40f59..c29b21b91 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -179,6 +179,8 @@ int main(int argc, char* argv[]) std::vector benchmarked_engine_types{ROCRAND_RNG_PSEUDO_LFSR113, ROCRAND_RNG_PSEUDO_MRG31K3P, ROCRAND_RNG_PSEUDO_MRG32K3A, + ROCRAND_RNG_PSEUDO_MTGP32, + ROCRAND_RNG_PSEUDO_MT19937, ROCRAND_RNG_PSEUDO_PHILOX4_32_10, ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, @@ -190,12 +192,6 @@ int main(int argc, char* argv[]) ROCRAND_RNG_QUASI_SOBOL64, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; - if(!benchmark_host) - { - benchmarked_engine_types.push_back(ROCRAND_RNG_PSEUDO_MTGP32); - benchmarked_engine_types.push_back(ROCRAND_RNG_PSEUDO_MT19937); - } - const std::map ordering_name_map{ {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, { ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, diff --git a/library/include/rocrand/rocrand.h b/library/include/rocrand/rocrand.h index 0f159b131..c92fbf209 100644 --- a/library/include/rocrand/rocrand.h +++ b/library/include/rocrand/rocrand.h @@ -170,20 +170,7 @@ rocrand_create_generator(rocrand_generator * generator, rocrand_rng_type rng_typ * and returns it in \p generator. This generator is executed on the host rather than * on a device, and it is enqueued on the stream associated with the generator. 
* - * Values for \p rng_type are: - * - ROCRAND_RNG_PSEUDO_XORWOW - * - ROCRAND_RNG_PSEUDO_MRG31K3P - * - ROCRAND_RNG_PSEUDO_MRG32K3A - * - ROCRAND_RNG_PSEUDO_PHILOX4_32_10 - * - ROCRAND_RNG_PSEUDO_LFSR113 - * - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20 - * - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20 - * - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20 - * - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20 - * - ROCRAND_RNG_QUASI_SOBOL32 - * - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32 - * - ROCRAND_RNG_QUASI_SOBOL64 - * - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64 + * All generators are supported. * * \param generator - Pointer to generator * \param rng_type - Type of generator to create diff --git a/library/src/rng/generator_type_mt19937.cpp b/library/src/rng/generator_type_mt19937.cpp index f48105da6..3c72b5811 100644 --- a/library/src/rng/generator_type_mt19937.cpp +++ b/library/src/rng/generator_type_mt19937.cpp @@ -26,5 +26,7 @@ namespace rocrand_impl::host { template struct generator_type; +template struct generator_type>; +template struct generator_type>; } // namespace rocrand_impl::host diff --git a/library/src/rng/generator_types.hpp b/library/src/rng/generator_types.hpp index 9a09c0298..995ee131c 100644 --- a/library/src/rng/generator_types.hpp +++ b/library/src/rng/generator_types.hpp @@ -45,6 +45,8 @@ extern template struct generator_type; extern template struct generator_type>; extern template struct generator_type>; extern template struct generator_type; +extern template struct generator_type>; +extern template struct generator_type>; extern template struct generator_type; extern template struct generator_type>; extern template struct generator_type>; diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 94caaa12b..e7bbf9b9b 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -89,7 +89,10 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, unsigned long long seed, const unsigned int* __restrict__ jump) { -#if 
!defined(__HIP_DEVICE_COMPILE__) +#if defined(__HIP_DEVICE_COMPILE__) + static constexpr bool isDevice = true; +#else + static constexpr bool isDevice = false; if(thread_idx.x > 0) { return; @@ -130,9 +133,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } } -#if defined(__HIP_DEVICE_COMPILE__) - __syncthreads(); -#endif + system::syncthreads{}(); // clang-format off for(unsigned int i = 0; i < items_per_thread; i++) @@ -154,9 +155,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } // clang-format on -#if defined(__HIP_DEVICE_COMPILE__) - __syncthreads(); -#endif + system::syncthreads{}(); const unsigned int engine_id = block_idx.x; @@ -185,9 +184,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, { temp[i] = 0; } -#if defined(__HIP_DEVICE_COMPILE__) - __syncthreads(); -#endif + system::syncthreads{}(); const unsigned int* pf = jump + (r * (mt19937_jumps_radix - 1) + radix - 1) * mt19937_p_size; @@ -205,9 +202,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, = (t0 & mt19937_constants::upper_mask) | (t1 & mt19937_constants::lower_mask); temp[ptr] = tm ^ (y >> 1) ^ ((y & 0x1U) ? 
mt19937_constants::matrix_a : 0); } -#if defined(__HIP_DEVICE_COMPILE__) - __syncthreads(); -#endif + system::syncthreads{}(); ptr = wrap_n(ptr + 1); if((pf[pfi / 32] >> (pfi % 32)) & 1) @@ -230,9 +225,7 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } #endif } -#if defined(__HIP_DEVICE_COMPILE__) - __syncthreads(); -#endif + system::syncthreads{}(); } } @@ -253,9 +246,8 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, } #if !defined(__HIP_DEVICE_COMPILE__) } -#else - __syncthreads(); #endif + system::syncthreads{}(); } // Save state @@ -495,7 +487,7 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, #pragma unroll for(size_t i = 0; i < 8; ++i) { - thread_engines[i] = accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); + thread_engines[i] = accessor.load(block_idx.x * block_dim.x + thread_idx.x + i); } #endif // clang-format on @@ -508,7 +500,6 @@ __host__ __device__ inline void generate_long_mt19937(dim3 block_idx, if(start_input > 0) { #if !defined(__HIP_DEVICE_COMPILE__) -#pragma unroll for(unsigned int warp_lane = 0; warp_lane < 8; warp_lane++) { auto& input = inputs[warp_lane]; diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index cd32eb088..fb5bfee0c 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -220,19 +220,20 @@ struct mt19937_octo_engine m_state.mt[idx_i + j] = comp(m_state.mt[idx_i + j], last_dep, m_state.mt[idx_m + j]); } - static void comp_vector(unsigned int idx_i, - unsigned int idx_m, - unsigned int last_dep_tid_7, - mt19937_octo_engine thread_engines[8]) + __host__ static void comp_vector(unsigned int idx_i, + unsigned int idx_m, + unsigned int last_dep_tid_7, + mt19937_octo_engine (&thread_engines)[8]) { + static constexpr unsigned int numberOfLanes = 8; // communicate the dependency for the last value - unsigned int last_deps[8]; - for(int i = 0; i < 8; ++i) + unsigned int 
last_deps[numberOfLanes]; + for(unsigned int i = 0; i < numberOfLanes; ++i) { last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[idx_i]; } - for(int i = 0; i < 8; ++i) + for(unsigned int i = 0; i < numberOfLanes; ++i) { // thread 7 needs a special value that does not fit the pattern unsigned int last_dep = i == 7 ? last_dep_tid_7 : last_deps[i]; @@ -401,7 +402,7 @@ struct mt19937_octo_engine comp_vector(tid, i568, i341, v000); } - static void gen_next_n(mt19937_octo_engine thread_engines[8]) + static void gen_next_n(mt19937_octo_engine (&thread_engines)[8]) { // compute eleven vectors that follow a regular pattern and compute // eight special values for a total of n new elements. diff --git a/test/internal/test_rocrand_mt19937_prng.cpp b/test/internal/test_rocrand_mt19937_prng.cpp index c3e3d60a4..51b610115 100644 --- a/test/internal/test_rocrand_mt19937_prng.cpp +++ b/test/internal/test_rocrand_mt19937_prng.cpp @@ -749,12 +749,7 @@ TYPED_TEST(mt19937_generator_engine_tests, subsequence_test) d_engines, seed, d_mt19937_jump); - if(status != ROCRAND_STATUS_SUCCESS) - { - std::cout << "rocRAND error code: " << status << " while calling: jump_ahead_mt19937" - << std::endl; - exit(status); - } + ASSERT_EQ(status, ROCRAND_STATUS_SUCCESS); octo_engine_type* d_octo_engines{}; HIP_CHECK(hipMalloc(&d_octo_engines, @@ -1172,12 +1167,7 @@ TYPED_TEST(mt19937_generator_engine_tests, jump_ahead_test) d_engines1, seed, d_mt19937_jump); - if(status != ROCRAND_STATUS_SUCCESS) - { - std::cout << "rocRAND error code: " << status - << " while calling: jump_ahead_mt19937" << std::endl; - exit(status); - } + ASSERT_EQ(status, ROCRAND_STATUS_SUCCESS); }); std::vector h_engines1(generator_count * n); From fe1650a058f976d2e5fe97bffd4193921569e86d Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 16 May 2024 09:49:53 +0000 Subject: [PATCH 32/54] remove synchronization from host_system::launch function add synchronization to host_system alloc, free and memcpy --- 
library/src/rng/mt19937_octo_engine.hpp | 2 +- library/src/rng/system.hpp | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index fb5bfee0c..c9a591cab 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -230,7 +230,7 @@ struct mt19937_octo_engine unsigned int last_deps[numberOfLanes]; for(unsigned int i = 0; i < numberOfLanes; ++i) { - last_deps[i] = thread_engines[(i + 1) % 8].m_state.mt[idx_i]; + last_deps[i] = thread_engines[(i + 1) % numberOfLanes].m_state.mt[idx_i]; } for(unsigned int i = 0; i < numberOfLanes; ++i) diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 34e74a9cc..7d692c8bd 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -63,6 +63,12 @@ struct host_system template static rocrand_status alloc(T** ptr, size_t n) { + hipError_t status = hipDeviceSynchronize(); + if(status != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } + *ptr = new(std::nothrow) T[n]; if(!*ptr) { @@ -74,11 +80,21 @@ struct host_system template static void free(T* ptr) { + hipError_t status = hipDeviceSynchronize(); + if(status != hipSuccess) + { + return; + } delete[] ptr; } static rocrand_status memcpy(void* dst, const void* src, size_t size, hipMemcpyKind /*kind*/) { + hipError_t status = hipDeviceSynchronize(); + if(status != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } std::memcpy(dst, src, size); return ROCRAND_STATUS_SUCCESS; } @@ -159,10 +175,9 @@ struct host_system if constexpr(UseHostFunc) { - hipError_t status = hipLaunchHostFunc(stream, kernel_callback, kernel_args); - hipError_t sync_status = hipStreamSynchronize(stream); + hipError_t status = hipLaunchHostFunc(stream, kernel_callback, kernel_args); - if(status != hipSuccess || sync_status != hipSuccess) + if(status != hipSuccess) { // At this point, if the callback has 
not been invoked, there will be a memory // leak. It is unclear whether hipLaunchHostFunc can return an error after the From e2518b9f4c6df24503783e0974e7ef9675f70208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 22 May 2024 11:58:50 +0000 Subject: [PATCH 33/54] Implement asynchronous initialization of poisson distribution --- library/src/rng/distribution/discrete.hpp | 358 ++++++++++------- library/src/rng/distribution/poisson.hpp | 362 ++++++++++++++---- library/src/rng/generator_type.hpp | 4 +- library/src/rng/lfsr113.hpp | 34 +- library/src/rng/mrg.hpp | 36 +- library/src/rng/mt19937.hpp | 35 +- library/src/rng/mtgp32.hpp | 40 +- library/src/rng/philox4x32_10.hpp | 33 +- library/src/rng/sobol.hpp | 33 +- library/src/rng/system.hpp | 77 ++++ library/src/rng/threefry.hpp | 35 +- library/src/rng/xorwow.hpp | 33 +- library/src/rocrand.cpp | 57 ++- test/internal/test_poisson_distribution.cpp | 58 ++- test/internal/test_rocrand_generator_type.cpp | 6 + test/test_common.hpp | 23 ++ 16 files changed, 885 insertions(+), 339 deletions(-) diff --git a/library/src/rng/distribution/discrete.hpp b/library/src/rng/distribution/discrete.hpp index a8f3333b8..9e3bf1779 100644 --- a/library/src/rng/distribution/discrete.hpp +++ b/library/src/rng/distribution/discrete.hpp @@ -25,9 +25,11 @@ #include #include +#include #include #include +#include #include // Alias method @@ -48,174 +50,175 @@ enum discrete_method DISCRETE_METHOD_UNIVERSAL = DISCRETE_METHOD_ALIAS | DISCRETE_METHOD_CDF }; -template -class discrete_distribution_base : public rocrand_discrete_distribution_st +/// \brief Encapsulates a `rocrand_discrete_distribution_st` and makes it possible +/// to sample the discrete distribution in the host generators. 
+template +class discrete_distribution_base { public: - - static constexpr unsigned int input_width = 1; - static constexpr unsigned int output_width = 1; + static constexpr inline unsigned int input_width = 1; + static constexpr inline unsigned int output_width = 1; // rocrand_discrete_distribution_st is a struct - discrete_distribution_base() // cppcheck-suppress uninitDerivedMemberVar - { - size = 0; - offset = 0; - probability = NULL; - alias = NULL; - cdf = NULL; - } - - discrete_distribution_base(const double* probabilities, unsigned int size, unsigned int offset) - : discrete_distribution_base() - { - std::vector p(probabilities, probabilities + size); - - init(p, size, offset); - } - - __host__ __device__ ~discrete_distribution_base() {} - - void deallocate() - { - // Explicit deallocation is used because the object is copied - // multiple times inside hipLaunchKernelGGL, and destructor is called - // for all copies (we can't use c++ smart pointers for device pointers) - if (IsHostSide) - { - if (probability != NULL) - { - delete[] probability; - } - if (alias != NULL) - { - delete[] alias; - } - if (cdf != NULL) - { - delete[] cdf; - } - } - else - { - if (probability != NULL) - { - ROCRAND_HIP_FATAL_ASSERT(hipFree(probability)); - } - if (alias != NULL) - { - ROCRAND_HIP_FATAL_ASSERT(hipFree(alias)); - } - if (cdf != NULL) - { - ROCRAND_HIP_FATAL_ASSERT(hipFree(cdf)); - } - } - probability = NULL; - alias = NULL; - cdf = NULL; - } + explicit discrete_distribution_base(const rocrand_discrete_distribution_st& distribution) + : m_distribution(distribution) + {} template __forceinline__ __host__ __device__ unsigned int operator()(T x) const { - if((Method & DISCRETE_METHOD_ALIAS) != 0) + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) { - return rocrand_device::detail::discrete_alias(x, *this); + return rocrand_device::detail::discrete_alias(x, m_distribution); } else { - return rocrand_device::detail::discrete_cdf(x, *this); + return 
rocrand_device::detail::discrete_cdf(x, m_distribution); } } template - __host__ __device__ void operator()(const T (&input)[1], unsigned int output[1]) const + __forceinline__ __host__ __device__ void operator()(const T (&input)[1], + unsigned int output[1]) const { output[0] = (*this)(input[0]); } -protected: +private: + rocrand_discrete_distribution_st m_distribution; +}; - void init(std::vector p, - const unsigned int size, - const unsigned int offset) +/// \brief A collection of static methods for constructing and destroying +/// instances of `rocrand_discrete_distribution_st`. +/// \tparam Method Controls which members of the produced `rocrand_discrete_distribution_st` +/// are populated. +/// \tparam IsHostSide Controls whether the allocated and filled memory blocks reside +/// on the host or on the device. +template +class discrete_distribution_factory +{ +public: + /// \brief Allocates and populates an instance of `rocrand_discrete_distribution_st`. + /// \note `allocate` and `normalize` are called by this function, therefore they + /// do not need to be called separately. + /// \note The produced `rocrand_discrete_distribution_st` MUST be deallocated by the matching + /// `deallocate` function when it's no longer used. + /// \param p The probability array of the discrete distribution. + /// \param size The size of the input probability array. This must not exceed the size of `p`. + /// \param offset The offset of the input probability array. + /// \param [out] distribution The allocated and populated discrete distribution instance. + /// \return `ROCRAND_STATUS_SUCCESS` if the operation is successful, otherwise an error code from the + /// first failing procedure.
+ static rocrand_status create(std::vector p, + const unsigned int size, + const unsigned int offset, + rocrand_discrete_distribution_st& distribution) { - this->size = size; - this->offset = offset; - - deallocate(); - allocate(); - normalize(p); - if((Method & DISCRETE_METHOD_ALIAS) != 0) + rocrand_status status = allocate(size, offset, distribution); + if(status != ROCRAND_STATUS_SUCCESS) { - create_alias_table(p); + return status; } - if((Method & DISCRETE_METHOD_CDF) != 0) + normalize(p, size); + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) { - create_cdf(p); + std::vector h_probability(size); + std::vector h_alias(size); + create_alias_table(p, size, h_probability.begin(), h_alias.begin()); + status = copy_alias_table(distribution, h_probability, h_alias); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + std::vector h_cdf(size); + create_cdf(p, size, h_cdf.begin()); + status = copy_cdf(distribution, h_cdf); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + } + return ROCRAND_STATUS_SUCCESS; } - void allocate() + /// \brief Frees the allocated memory associated with the passed distribution that was + /// previously created by `create` or `allocate`. + /// \param [in,out] distribution The distribution to deallocate. + /// The fields of the distribution are set to default values. + /// \return `ROCRAND_STATUS_SUCCESS` if the operation is successful, otherwise an error code from the + /// first failing procedure. 
+ static rocrand_status deallocate(rocrand_discrete_distribution_st& distribution) { - if (IsHostSide) + if constexpr(IsHostSide) { - if((Method & DISCRETE_METHOD_ALIAS) != 0) - { - probability = new double[size]; - alias = new unsigned int[size]; - } - if((Method & DISCRETE_METHOD_CDF) != 0) - { - cdf = new double[size]; - } + delete[] distribution.probability; + delete[] distribution.alias; + delete[] distribution.cdf; } else { hipError_t error; - if((Method & DISCRETE_METHOD_ALIAS) != 0) + error = hipFree(distribution.probability); + if(error != hipSuccess) { - error = hipMalloc(&probability, sizeof(double) * size); - if (error != hipSuccess) - { - throw ROCRAND_STATUS_ALLOCATION_FAILED; - } - error = hipMalloc(&alias, sizeof(unsigned int) * size); - if (error != hipSuccess) - { - throw ROCRAND_STATUS_ALLOCATION_FAILED; - } + return ROCRAND_STATUS_INTERNAL_ERROR; } - if((Method & DISCRETE_METHOD_CDF) != 0) + error = hipFree(distribution.alias); + if(error != hipSuccess) { - error = hipMalloc(&cdf, sizeof(double) * size); - if (error != hipSuccess) - { - throw ROCRAND_STATUS_ALLOCATION_FAILED; - } + return ROCRAND_STATUS_INTERNAL_ERROR; + } + error = hipFree(distribution.cdf); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; } } + + distribution = {}; + return ROCRAND_STATUS_SUCCESS; } - void normalize(std::vector& p) const + /// \brief Normalizes the values in probability vector `p`. + /// \param [in,out] p The probability vector to normalize. + /// \param size The size of the probability vector. + /// It MUST NOT be larger than the size of `p`.
+ static void normalize(std::vector& p, const unsigned int size) { double sum = 0.0; - for (unsigned int i = 0; i < size; i++) + for(unsigned int i = 0; i < size; i++) { sum += p[i]; } - // Normalize probabilities - for (unsigned int i = 0; i < size; i++) + for(unsigned int i = 0; i < size; i++) { p[i] /= sum; } } - void create_alias_table(std::vector p) + /// \brief Computes the alias table from the probability vector for a discrete distribution. + /// \tparam ProbabilityIt The type of the output iterator to which the calculated probabilities are written. + /// Must be a RandomAccessIterator. + /// \tparam AliasIt The type of the output iterator to which the calculated aliases are written. + /// Must be a RandomAccessIterator. + /// \param p The normalized probability vector. + /// \param size The size of the probability vector. + /// It MUST NOT be larger than the size of `p`. + /// \param h_probability Probabilities output iterator. + /// \param h_alias Aliases output iterator. + template + static void create_alias_table(std::vector p, + const unsigned int size, + ProbabilityIt h_probability, + AliasIt h_alias) { - std::vector h_probability(size); - std::vector h_alias(size); + static_assert( + std::is_same_v::value_type>); + static_assert( + std::is_same_v::value_type>); const double average = 1.0 / size; @@ -259,52 +262,135 @@ class discrete_distribution_base : public rocrand_discrete_distribution_st { h_probability[i] = 1.0; } + } + + /// \brief Computes the CDF (cumulative distribution function) table from the + /// probability vector for a discrete distribution. + /// \tparam CdfIt The type of the output iterator to which the calculated CDF values are written. + /// Must be a RandomAccessIterator. + /// \param p The normalized probability vector. + /// \param size The size of the probability vector. + /// It MUST NOT be larger than the size of `p`. + /// \param h_cdf CDF output iterator. 
+ template + static void create_cdf(const std::vector& p, const unsigned int size, CdfIt h_cdf) + { + static_assert(std::is_same_v::value_type>); - if (IsHostSide) + double sum = 0.0; + for(unsigned int i = 0; i < size; i++) { - std::copy(h_probability.begin(), h_probability.end(), probability); - std::copy(h_alias.begin(), h_alias.end(), alias); + sum += p[i]; + h_cdf[i] = sum; + } + } + + /// \brief Allocates the required amount of memory for a `rocrand_discrete_distribution_st`. + /// \param size The size of the input probability array. + /// \param offset The offset of the input probability array. + /// \param [out] distribution The distribution to allocate. + /// \return `ROCRAND_STATUS_SUCCESS` if the operation is successful, otherwise an error code from the + /// first failing procedure. + static rocrand_status allocate(const unsigned int size, + const unsigned int offset, + rocrand_discrete_distribution_st& distribution) + { + distribution = {}; + distribution.size = size; + distribution.offset = offset; + if constexpr(IsHostSide) + { + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) + { + distribution.probability = new double[distribution.size]; + distribution.alias = new unsigned int[distribution.size]; + } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + distribution.cdf = new double[distribution.size]; + } } else { hipError_t error; - error = hipMemcpy(probability, h_probability.data(), sizeof(double) * size, hipMemcpyDefault); - if (error != hipSuccess) + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) { - throw ROCRAND_STATUS_INTERNAL_ERROR; + error = hipMalloc(&distribution.probability, sizeof(double) * distribution.size); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } + error = hipMalloc(&distribution.alias, sizeof(unsigned int) * distribution.size); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } } - error = hipMemcpy(alias, h_alias.data(), sizeof(unsigned int) * size, 
hipMemcpyDefault); - if (error != hipSuccess) + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) { - throw ROCRAND_STATUS_INTERNAL_ERROR; + error = hipMalloc(&distribution.cdf, sizeof(double) * distribution.size); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } } } + return ROCRAND_STATUS_SUCCESS; } - void create_cdf(std::vector p) +private: + static rocrand_status copy_alias_table(const rocrand_discrete_distribution_st& distribution, + const std::vector& h_probability, + const std::vector& h_alias) { - std::vector h_cdf(size); - - double sum = 0.0; - for (unsigned int i = 0; i < size; i++) + if constexpr(IsHostSide) { - sum += p[i]; - h_cdf[i] = sum; + std::copy(h_probability.begin(), h_probability.end(), distribution.probability); + std::copy(h_alias.begin(), h_alias.end(), distribution.alias); } + else + { + hipError_t error; + error = hipMemcpy(distribution.probability, + h_probability.data(), + sizeof(double) * distribution.size, + hipMemcpyHostToDevice); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + error = hipMemcpy(distribution.alias, + h_alias.data(), + sizeof(unsigned int) * distribution.size, + hipMemcpyHostToDevice); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + } + return ROCRAND_STATUS_SUCCESS; + } - if (IsHostSide) + static rocrand_status copy_cdf(const rocrand_discrete_distribution_st& distribution, + const std::vector& h_cdf) + { + if constexpr(IsHostSide) { - std::copy(h_cdf.begin(), h_cdf.end(), cdf); + std::copy(h_cdf.begin(), h_cdf.end(), distribution.cdf); } else { hipError_t error; - error = hipMemcpy(cdf, h_cdf.data(), sizeof(double) * size, hipMemcpyDefault); + error = hipMemcpy(distribution.cdf, + h_cdf.data(), + sizeof(double) * distribution.size, + hipMemcpyHostToDevice); if (error != hipSuccess) { - throw ROCRAND_STATUS_INTERNAL_ERROR; + return ROCRAND_STATUS_INTERNAL_ERROR; } } + return ROCRAND_STATUS_SUCCESS; } }; diff --git 
a/library/src/rng/distribution/poisson.hpp b/library/src/rng/distribution/poisson.hpp index 764a42192..a34e1bd66 100644 --- a/library/src/rng/distribution/poisson.hpp +++ b/library/src/rng/distribution/poisson.hpp @@ -21,144 +21,358 @@ #ifndef ROCRAND_RNG_DISTRIBUTION_POISSON_H_ #define ROCRAND_RNG_DISTRIBUTION_POISSON_H_ +#include "../system.hpp" #include "discrete.hpp" #include +#include +#include #include #include +#include #include +#include +#include +#include +#include #include namespace rocrand_impl::host { -template -class poisson_distribution : public discrete_distribution_base +template +class poisson_distribution : private discrete_distribution_base { public: - typedef discrete_distribution_base base; + static constexpr inline unsigned int input_width = 1; + static constexpr inline unsigned int output_width = 1; - poisson_distribution() : base() {} + using base_t = discrete_distribution_base; - explicit poisson_distribution(double lambda) : poisson_distribution() + poisson_distribution(const rocrand_discrete_distribution_st& distribution, const double lambda) + : base_t(distribution), m_lambda(lambda) + {} + + template + __forceinline__ __host__ __device__ unsigned int operator()(T x) const { - set_lambda(lambda); + if(m_lambda > rocrand_device::detail::lambda_threshold_huge) + { + const double normal_d = rocrand_device::detail::normal_distribution_double(x); + return static_cast(round(sqrt(m_lambda) * normal_d + m_lambda)); + } + else + { + return base_t::operator()(x); + } } - void set_lambda(double lambda) + template + __forceinline__ __host__ __device__ void operator()(const T (&input)[1], + unsigned int (&output)[1]) const { - const size_t capacity = - 2 * static_cast(16.0 * (2.0 + std::sqrt(lambda))); - std::vector p(capacity); - - calculate_probabilities(p, capacity, lambda); - - this->init(p, this->size, this->offset); + output[0] = (*this)(input[0]); } -protected: +private: + double m_lambda; +}; - void calculate_probabilities(std::vector& p, 
const size_t capacity, - const double lambda) - { - const double p_epsilon = 1e-12; - const double log_lambda = std::log(lambda); +[[nodiscard]] inline std::vector + calculate_poisson_probabilities(const double lambda, unsigned int& size, unsigned int& offset) +{ + const size_t capacity = 2 * static_cast(16.0 * (2.0 + std::sqrt(lambda))); + std::vector p(capacity); - const int left = static_cast(std::floor(lambda)) - capacity / 2; + const double p_epsilon = 1e-12; + const double log_lambda = std::log(lambda); - // Calculate probabilities starting from mean in both directions, - // because only a small part of [0, lambda] has non-negligible values - // (> p_epsilon). + const int left = static_cast(std::floor(lambda)) - capacity / 2; - int lo = 0; - for (int i = capacity / 2; i >= 0; i--) - { - const double x = left + i; - const double pp = std::exp(x * log_lambda - std::lgamma(x + 1.0) - lambda); - if (pp < p_epsilon) - { - lo = i + 1; - break; - } - p[i] = pp; - } + // Calculate probabilities starting from mean in both directions, + // because only a small part of [0, lambda] has non-negligible values + // (> p_epsilon). 
- int hi = capacity - 1; - for (int i = capacity / 2 + 1; i < static_cast(capacity); i++) + int lo = 0; + for(int i = capacity / 2; i >= 0; i--) + { + const double x = left + i; + const double pp = std::exp(x * log_lambda - std::lgamma(x + 1.0) - lambda); + if(pp < p_epsilon) { - const double x = left + i; - const double pp = std::exp(x * log_lambda - std::lgamma(x + 1.0) - lambda); - if (pp < p_epsilon) - { - hi = i - 1; - break; - } - p[i] = pp; + lo = i + 1; + break; } + p[i] = pp; + } - for (int i = lo; i <= hi; i++) + int hi = capacity - 1; + for(int i = capacity / 2 + 1; i < static_cast(capacity); i++) + { + const double x = left + i; + const double pp = std::exp(x * log_lambda - std::lgamma(x + 1.0) - lambda); + if(pp < p_epsilon) { - p[i - lo] = p[i]; + hi = i - 1; + break; } + p[i] = pp; + } - this->size = hi - lo + 1; - this->offset = left + lo; + for(int i = lo; i <= hi; i++) + { + p[i - lo] = p[i]; } -}; + + size = hi - lo + 1; + offset = left + lo; + + return p; +} + +inline void calculate_poisson_size(const double lambda, unsigned int& size, unsigned int& offset) +{ + (void)calculate_poisson_probabilities(lambda, size, offset); +} // Handles caching of precomputed tables for the distribution and recomputes // them only when lambda is changed (as these computations, device memory // allocations and copying take time). -template +template class poisson_distribution_manager { public: - poisson_distribution dis; + using factory_t = discrete_distribution_factory; + using distribution_t = poisson_distribution; poisson_distribution_manager() = default; poisson_distribution_manager(const poisson_distribution_manager&) = delete; poisson_distribution_manager(poisson_distribution_manager&& other) - : dis(other.dis), lambda(other.lambda) - { - // For now, we didn't make poisson_distribution move-only - // We copied the pointers of dis. 
Prevent deallocation by the destructor of other - other.dis = {}; - } + : m_initialized(std::exchange(other.m_initialized, false)) + , m_is_host_func_blocking(other.m_is_host_func_blocking) + , m_stream(other.m_stream) + , m_probability(std::exchange(other.m_probability, nullptr)) + , m_alias(std::exchange(other.m_alias, nullptr)) + , m_cdf(std::exchange(other.m_cdf, nullptr)) + , m_lambda(other.m_lambda) + , m_distribution(std::exchange(other.m_distribution, {})) + {} poisson_distribution_manager& operator=(const poisson_distribution_manager&) = delete; poisson_distribution_manager& operator=(poisson_distribution_manager&& other) { - dis = other.dis; - lambda = other.lambda; - - // For now, we didn't make poisson_distribution move-only - // We copied the pointers of dis. Prevent deallocation by the destructor of other - other.dis = {}; + m_initialized = other.m_initialized; + m_is_host_func_blocking = other.m_is_host_func_blocking; + m_stream = other.m_stream; + m_lambda = other.m_lambda; + std::swap(m_probability, other.m_probability); + std::swap(m_alias, other.m_alias); + std::swap(m_cdf, other.m_cdf); + std::swap(m_distribution, other.m_distribution); return *this; } ~poisson_distribution_manager() { - dis.deallocate(); + factory_t::deallocate(m_distribution); + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) + { + ROCRAND_HIP_FATAL_ASSERT(hipHostFree(m_probability)); + ROCRAND_HIP_FATAL_ASSERT(hipHostFree(m_alias)); + } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + ROCRAND_HIP_FATAL_ASSERT(hipHostFree(m_cdf)); + } } - void set_lambda(double new_lambda) + rocrand_status init() { - const bool changed = lambda != new_lambda; - if (changed) + if(m_initialized) { - lambda = new_lambda; - dis.set_lambda(lambda); + return ROCRAND_STATUS_SUCCESS; } + + unsigned int size; + unsigned int offset; + calculate_poisson_size(rocrand_device::detail::lambda_threshold_huge, size, offset); + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) + { + hipError_t
error = hipHostMalloc(&m_probability, size * sizeof(*m_probability)); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } + error = hipHostMalloc(&m_alias, size * sizeof(*m_alias)); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } + } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + const hipError_t error = hipHostMalloc(&m_cdf, size * sizeof(*m_cdf)); + if(error != hipSuccess) + { + return ROCRAND_STATUS_ALLOCATION_FAILED; + } + } + const rocrand_status status = factory_t::allocate(size, offset, m_distribution); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + + m_initialized = true; + return ROCRAND_STATUS_SUCCESS; + } + + rocrand_status set_stream(const hipStream_t stream) + { + const rocrand_status status + = System::is_host_func_blocking(stream, m_is_host_func_blocking); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_stream = stream; + return ROCRAND_STATUS_SUCCESS; + } + + std::variant get_distribution(const double lambda) + { + if(!m_initialized) + { + const rocrand_status status = init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + } + + std::unique_lock lock(m_mutex, std::defer_lock_t{}); + if(!m_is_host_func_blocking) + { + lock.lock(); + } + + const bool changed = lambda != m_lambda; + if(changed && lambda <= rocrand_device::detail::lambda_threshold_huge) + { + auto arg = std::make_unique( + update_discrete_distribution_arg{lambda, this}); + const rocrand_status status + = System::launch_host_func(m_stream, update_discrete_distribution, arg.release()); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + if constexpr(System::is_device() && (Method & DISCRETE_METHOD_ALIAS)) + { + hipError_t error + = hipMemcpyAsync(m_distribution.probability, + m_probability, + m_distribution.size * sizeof(*m_distribution.probability), + hipMemcpyHostToDevice, + m_stream); + if(error != hipSuccess) + { + return 
ROCRAND_STATUS_INTERNAL_ERROR; + } + error = hipMemcpyAsync(m_distribution.alias, + m_alias, + m_distribution.size * sizeof(*m_distribution.alias), + hipMemcpyHostToDevice, + m_stream); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + } + if constexpr(System::is_device() && (Method & DISCRETE_METHOD_CDF)) + { + const hipError_t error + = hipMemcpyAsync(m_distribution.cdf, + m_cdf, + m_distribution.size * sizeof(*m_distribution.cdf), + hipMemcpyHostToDevice, + m_stream); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + } + } + + rocrand_discrete_distribution_st distribution_copy = m_distribution; + calculate_poisson_size(lambda, distribution_copy.size, distribution_copy.offset); + return distribution_t(distribution_copy, lambda); } private: - double lambda = 0.0; + bool m_initialized = false; + bool m_is_host_func_blocking = true; + hipStream_t m_stream = 0; + std::mutex m_mutex; + double* m_probability = nullptr; + unsigned int* m_alias = nullptr; + double* m_cdf = nullptr; + double m_lambda = 0; + rocrand_discrete_distribution_st m_distribution = {}; + + struct update_discrete_distribution_arg + { + double lambda; + poisson_distribution_manager* manager; + }; + + static void update_discrete_distribution(void* user_data) + { + std::unique_ptr arg( + reinterpret_cast(user_data)); + std::unique_lock lock(arg->manager->m_mutex, std::defer_lock_t{}); + if(!arg->manager->m_is_host_func_blocking) + { + lock.lock(); + } + unsigned int size; + unsigned int offset; + std::vector poisson_probabilities + = calculate_poisson_probabilities(arg->lambda, size, offset); + assert(size <= arg->manager->m_distribution.size); + factory_t::normalize(poisson_probabilities, size); + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) + { + factory_t::create_alias_table(poisson_probabilities, + size, + arg->manager->m_probability, + arg->manager->m_alias); + } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + 
factory_t::create_cdf(poisson_probabilities, size, arg->manager->m_cdf); + } + arg->manager->m_lambda = arg->lambda; + if constexpr(!System::is_device()) + { + if constexpr((Method & DISCRETE_METHOD_ALIAS) != 0) + { + std::copy_n(arg->manager->m_probability, + size, + arg->manager->m_distribution.probability); + std::copy_n(arg->manager->m_alias, size, arg->manager->m_distribution.alias); + } + if constexpr((Method & DISCRETE_METHOD_CDF) != 0) + { + std::copy_n(arg->manager->m_cdf, size, arg->manager->m_distribution.cdf); + } + } + } }; // Mrg32k3a and Mrg31k3p @@ -166,13 +380,13 @@ class poisson_distribution_manager template struct mrg_engine_poisson_distribution { - using distribution_type = poisson_distribution; + using distribution_type = poisson_distribution; static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; distribution_type dis; - mrg_engine_poisson_distribution(distribution_type dis) : dis(dis) {} + explicit mrg_engine_poisson_distribution(distribution_type dis) : dis(dis) {} __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned int (&output)[1]) const diff --git a/library/src/rng/generator_type.hpp b/library/src/rng/generator_type.hpp index 0363730de..559e2aec7 100644 --- a/library/src/rng/generator_type.hpp +++ b/library/src/rng/generator_type.hpp @@ -44,7 +44,7 @@ struct rocrand_generator_base_type virtual rocrand_status set_order(rocrand_ordering order) = 0; virtual hipStream_t get_stream() const = 0; - virtual void set_stream(hipStream_t stream) = 0; + virtual rocrand_status set_stream(hipStream_t stream) = 0; virtual rocrand_status set_dimensions(unsigned int dimensions) = 0; @@ -128,7 +128,7 @@ struct generator_type : rocrand_generator_base_type return m_generator.get_stream(); } - void set_stream(hipStream_t stream) override final + rocrand_status set_stream(hipStream_t stream) override final { return m_generator.set_stream(stream); } diff --git a/library/src/rng/lfsr113.hpp 
b/library/src/rng/lfsr113.hpp index 6cb86639f..8473d466d 100644 --- a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -36,6 +36,7 @@ #include #include +#include namespace rocrand_impl::host { @@ -168,6 +169,9 @@ class lfsr113_generator_template : public generator_impl_base using system_type = System; using base_type = generator_impl_base; using engine_type = lfsr113_device_engine; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; lfsr113_generator_template(uint4 seeds = {ROCRAND_LFSR113_DEFAULT_SEED_X, ROCRAND_LFSR113_DEFAULT_SEED_Y, @@ -299,6 +303,17 @@ class lfsr113_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_engines_initialized) @@ -344,6 +359,12 @@ class lfsr113_generator_template : public generator_impl_base return status; } + status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; } @@ -435,15 +456,12 @@ class lfsr113_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -454,7 +472,7 @@ class lfsr113_generator_template : public generator_impl_base uint4 m_seed; // For caching of Poisson for consecutive generations with 
the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index bb68d40f5..9d474f04c 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -163,6 +163,9 @@ class mrg_generator_template : public generator_impl_base using base_type = generator_impl_base; using engine_type = Engine; using system_type = System; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; mrg_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -265,6 +268,17 @@ class mrg_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_engines_initialized) @@ -310,6 +324,12 @@ class mrg_generator_template : public generator_impl_base return status; } + status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; } @@ -405,17 +425,13 @@ class mrg_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - try + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - m_poisson.set_lambda(lambda); + return *error_status; } - catch(rocrand_status status) - { - return status; - } - mrg_engine_poisson_distribution distribution( - m_poisson.dis); - return generate(data, data_size, distribution); + mrg_engine_poisson_distribution mrg_dis(std::get(dis)); + return generate(data, data_size, mrg_dis); } 
private: @@ -439,7 +455,7 @@ class mrg_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index e7bbf9b9b..a2cb3e46f 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -663,6 +663,9 @@ class mt19937_generator_template : public generator_impl_base using base_type = generator_impl_base; using octo_engine_type = mt19937_octo_engine; using system_type = System; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; static constexpr inline unsigned int threads_per_generator = octo_engine_type::threads_per_generator; @@ -769,6 +772,17 @@ class mt19937_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_engines_initialized) @@ -873,6 +887,12 @@ class mt19937_generator_template : public generator_impl_base system_type::free(d_engines); + status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; m_start_input = 0; m_prev_input_width = 0; @@ -1050,15 +1070,12 @@ class mt19937_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = 
std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -1073,9 +1090,7 @@ class mt19937_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager::value> - m_poisson; + poisson_distribution_manager_t m_poisson; /// Number of independent generators. Value changes generated number stream. unsigned int m_generator_count = 0; diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index 5055d99a9..0693bc78f 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -305,6 +305,9 @@ class mtgp32_generator_template : public generator_impl_base using base_type = generator_impl_base; using engine_type = mtgp32_device_engine; using system_type = System; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; mtgp32_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -404,6 +407,17 @@ class mtgp32_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if (m_engines_initialized) @@ -441,6 +455,12 @@ class mtgp32_generator_template : public generator_impl_base return ROCRAND_STATUS_ALLOCATION_FAILED; } + status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; @@ -534,15 +554,23 @@ class mtgp32_generator_template : public generator_impl_base rocrand_status 
generate_poisson(unsigned int * data, size_t data_size, double lambda) { - try + // For an unknown reason, on CUDA, the initialization of the engines must precede + // the initialization of the poisson distribution, otherwise spurious miscalculations + // occur + if(!m_engines_initialized) { - m_poisson.set_lambda(lambda); + const auto status = init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -553,7 +581,7 @@ class mtgp32_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rng/philox4x32_10.hpp b/library/src/rng/philox4x32_10.hpp index ccc70c543..a4c02b1f0 100644 --- a/library/src/rng/philox4x32_10.hpp +++ b/library/src/rng/philox4x32_10.hpp @@ -220,6 +220,9 @@ class philox4x32_10_generator_template : public generator_impl_base using base_type = generator_impl_base; using engine_type = philox4x32_10_device_engine; using system_type = System; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; philox4x32_10_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -272,6 +275,17 @@ class philox4x32_10_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + 
base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_engines_initialized) @@ -279,6 +293,12 @@ class philox4x32_10_generator_template : public generator_impl_base m_engine = engine_type{m_seed, 0, m_offset}; + rocrand_status status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; } @@ -370,15 +390,12 @@ class philox4x32_10_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -388,7 +405,7 @@ class philox4x32_10_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp index 9ebce5177..84a90b85c 100644 --- a/library/src/rng/sobol.hpp +++ b/library/src/rng/sobol.hpp @@ -494,6 +494,9 @@ class sobol_generator_template : public generator_impl_base using engine_type = sobol_device_engine_t; using constant_type = std::conditional_t; using constant_accessor = sobol_constant_accessor; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; sobol_generator_template(unsigned long long offset = 0, rocrand_ordering order = ROCRAND_ORDERING_QUASI_DEFAULT, @@ -578,6 +581,17 @@ class sobol_generator_template : public generator_impl_base 
return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_initialized) @@ -585,6 +599,12 @@ class sobol_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_current_offset = static_cast(m_offset); m_initialized = true; @@ -715,15 +735,12 @@ class sobol_generator_template : public generator_impl_base { static_assert(Is64 || std::is_same_v, "The 32 bit sobol generator can only generate 32bit poisson"); - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -742,7 +759,7 @@ class sobol_generator_template : public generator_impl_base const constant_type* m_scramble_constants = nullptr; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_offset from base_type diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 83608f312..fee270430 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -49,6 +49,29 @@ namespace rocrand_impl::system { +namespace detail +{ + +inline rocrand_status is_stream_blocking(hipStream_t stream, bool& is_blocking) +{ + if(stream) + { + unsigned int stream_flags; + const hipError_t error = hipStreamGetFlags(stream, &stream_flags); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + is_blocking = (stream_flags & 
hipStreamNonBlocking) == 0; + } + else + { + is_blocking = true; + } + return ROCRAND_STATUS_SUCCESS; +} + +} // namespace detail /// \tparam UseHostFunc If true, launching will enqueue the kernel in the stream. Otherwise, /// execute the kernel synchronously. @@ -189,6 +212,45 @@ struct host_system return ROCRAND_STATUS_SUCCESS; } + + static rocrand_status + launch_host_func([[maybe_unused]] hipStream_t stream, hipHostFn_t fn, void* userData) + { + if constexpr(UseHostFunc) + { + const hipError_t error = hipLaunchHostFunc(stream, fn, userData); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + } + else + { + try + { + fn(userData); + } + catch(...) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + } + return ROCRAND_STATUS_SUCCESS; + } + + static rocrand_status is_host_func_blocking([[maybe_unused]] hipStream_t stream, + bool& is_blocking) + { + if constexpr(UseHostFunc) + { + return detail::is_stream_blocking(stream, is_blocking); + } + else + { + is_blocking = true; + } + return ROCRAND_STATUS_SUCCESS; + } }; namespace detail @@ -257,6 +319,21 @@ struct device_system } return ROCRAND_STATUS_SUCCESS; } + + static rocrand_status launch_host_func(hipStream_t stream, hipHostFn_t fn, void* userData) + { + const hipError_t error = hipLaunchHostFunc(stream, fn, userData); + if(error != hipSuccess) + { + return ROCRAND_STATUS_INTERNAL_ERROR; + } + return ROCRAND_STATUS_SUCCESS; + } + + static rocrand_status is_host_func_blocking(hipStream_t stream, bool& is_blocking) + { + return detail::is_stream_blocking(stream, is_blocking); + } }; template diff --git a/library/src/rng/threefry.hpp b/library/src/rng/threefry.hpp index f2b379e86..bcf589838 100644 --- a/library/src/rng/threefry.hpp +++ b/library/src/rng/threefry.hpp @@ -205,6 +205,9 @@ class threefry_generator_template : public generator_impl_base using engine_type = Engine; using scalar_type = typename engine_type::scalar_type; using system_type = System; + using 
poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; threefry_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -277,13 +280,32 @@ class threefry_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if(m_engines_initialized) + { return ROCRAND_STATUS_SUCCESS; + } m_engine = engine_type{m_seed, 0, m_offset}; + rocrand_status status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; } @@ -384,15 +406,12 @@ class threefry_generator_template : public generator_impl_base template rocrand_status generate_poisson(T* data, size_t data_size, double lambda) { - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -402,7 +421,7 @@ class threefry_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 23af661e3..1c518c987 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -170,6 +170,9 @@ class xorwow_generator_template : public generator_impl_base using 
base_type = generator_impl_base; using engine_type = xorwow_device_engine; using system_type = System; + using poisson_distribution_manager_t + = poisson_distribution_manager; + using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; xorwow_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -265,6 +268,17 @@ class xorwow_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + const rocrand_status status = m_poisson.set_stream(stream); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + base_type::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { if (m_engines_initialized) @@ -310,6 +324,12 @@ class xorwow_generator_template : public generator_impl_base return status; } + status = m_poisson.init(); + if(status != ROCRAND_STATUS_SUCCESS) + { + return status; + } + m_engines_initialized = true; return ROCRAND_STATUS_SUCCESS; } @@ -408,15 +428,12 @@ class xorwow_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int * data, size_t data_size, double lambda) { - try - { - m_poisson.set_lambda(lambda); - } - catch(rocrand_status status) + auto dis = m_poisson.get_distribution(lambda); + if(auto* error_status = std::get_if(&dis)) { - return status; + return *error_status; } - return generate(data, data_size, m_poisson.dis); + return generate(data, data_size, std::get(dis)); } private: @@ -428,7 +445,7 @@ class xorwow_generator_template : public generator_impl_base unsigned long long m_seed; // For caching of Poisson for consecutive generations with the same lambda - poisson_distribution_manager m_poisson; + poisson_distribution_manager_t m_poisson; // m_seed from base_type // m_offset from base_type diff --git a/library/src/rocrand.cpp b/library/src/rocrand.cpp index 7c29f5a29..76b669b66 100644 --- a/library/src/rocrand.cpp +++ 
b/library/src/rocrand.cpp @@ -20,6 +20,8 @@ #include +#include "rng/distribution/discrete.hpp" +#include "rng/distribution/poisson.hpp" #include "rng/generator_type.hpp" #include "rng/generator_types.hpp" @@ -391,8 +393,7 @@ rocrand_status ROCRANDAPI rocrand_set_stream(rocrand_generator generator, hipStr return ROCRAND_STATUS_NOT_CREATED; } - generator->set_stream(stream); - return ROCRAND_STATUS_SUCCESS; + return generator->set_stream(stream); } rocrand_status ROCRANDAPI rocrand_set_seed(rocrand_generator generator, unsigned long long seed) @@ -471,22 +472,23 @@ rocrand_status ROCRANDAPI rocrand_create_poisson_distribution( return ROCRAND_STATUS_OUT_OF_RANGE; } - poisson_distribution h_dis; - try - { - h_dis = poisson_distribution(lambda); - } - catch(const std::exception& e) - { - return ROCRAND_STATUS_INTERNAL_ERROR; - } - catch(rocrand_status status) + unsigned int size; + unsigned int offset; + const std::vector poisson_probabilities + = calculate_poisson_probabilities(lambda, size, offset); + + rocrand_discrete_distribution_st h_dis; + rocrand_status status + = discrete_distribution_factory::create(poisson_probabilities, + size, + offset, + h_dis); + if(status != ROCRAND_STATUS_SUCCESS) { return status; } - hipError_t error; - error = hipMalloc(discrete_distribution, sizeof(rocrand_discrete_distribution_st)); + hipError_t error = hipMalloc(discrete_distribution, sizeof(rocrand_discrete_distribution_st)); if(error != hipSuccess) { return ROCRAND_STATUS_ALLOCATION_FAILED; @@ -519,16 +521,13 @@ rocrand_status ROCRANDAPI return ROCRAND_STATUS_OUT_OF_RANGE; } - discrete_distribution_base h_dis; - try - { - h_dis = discrete_distribution_base(probabilities, size, offset); - } - catch(const std::exception& e) - { - return ROCRAND_STATUS_INTERNAL_ERROR; - } - catch(rocrand_status status) + rocrand_discrete_distribution_st h_dis; + rocrand_status status = discrete_distribution_factory::create( + std::vector(probabilities, probabilities + size), + size, + offset, + 
h_dis); + if(status != ROCRAND_STATUS_SUCCESS) { return status; } @@ -560,7 +559,7 @@ rocrand_status ROCRANDAPI return ROCRAND_STATUS_OUT_OF_RANGE; } - discrete_distribution_base h_dis; + rocrand_discrete_distribution_st h_dis; hipError_t error; error = hipMemcpy(&h_dis, @@ -572,11 +571,9 @@ rocrand_status ROCRANDAPI return ROCRAND_STATUS_INTERNAL_ERROR; } - try - { - h_dis.deallocate(); - } - catch(rocrand_status status) + const rocrand_status status + = discrete_distribution_factory::deallocate(h_dis); + if(status != ROCRAND_STATUS_SUCCESS) { return status; } diff --git a/test/internal/test_poisson_distribution.cpp b/test/internal/test_poisson_distribution.cpp index 2b3ce0207..7d4109af4 100644 --- a/test/internal/test_poisson_distribution.cpp +++ b/test/internal/test_poisson_distribution.cpp @@ -18,6 +18,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +#include "test_common.hpp" +#include "test_rocrand_common.hpp" #include #include @@ -28,29 +30,6 @@ using namespace rocrand_impl::host; -template -double get_mean(std::vector values) -{ - double mean = 0.0f; - for(auto v : values) - { - mean += static_cast(v); - } - return mean / values.size(); -} - -template -double get_variance(std::vector values, double mean) -{ - double variance = 0.0f; - for(auto v : values) - { - const double x = static_cast(v) - mean; - variance += x * x; - } - return variance / values.size(); -} - class poisson_distribution_tests : public ::testing::TestWithParam {}; @@ -61,19 +40,28 @@ TEST_P(poisson_distribution_tests, mean_var) std::random_device rd; std::mt19937 gen(rd()); - poisson_distribution dis; - dis.set_lambda(lambda); + using distribution_factory_t = discrete_distribution_factory; + + unsigned int size; + unsigned int offset; + const std::vector poisson_probabilities + = calculate_poisson_probabilities(lambda, size, offset); + rocrand_discrete_distribution_st discrete_dist; + ROCRAND_CHECK( + 
distribution_factory_t::create(poisson_probabilities, size, offset, discrete_dist)); + + poisson_distribution dis(discrete_dist, lambda); const size_t samples_count = static_cast(std::max(2.0, sqrt(lambda))) * 100000; std::vector values(samples_count); for(size_t si = 0; si < samples_count; si++) { - const unsigned int v = dis(gen()); + const unsigned int v = dis(static_cast(gen())); values[si] = v; } - dis.deallocate(); + distribution_factory_t::deallocate(discrete_dist); const double mean = get_mean(values); const double variance = get_variance(values, mean); @@ -90,8 +78,16 @@ TEST_P(poisson_distribution_tests, histogram_compare) SCOPED_TRACE(testing::Message() << "with seed = " << seed); std::mt19937 gen(seed); - poisson_distribution dis; - dis.set_lambda(lambda); + using distribution_factory_t = discrete_distribution_factory; + unsigned int size; + unsigned int offset; + const std::vector poisson_probabilities + = calculate_poisson_probabilities(lambda, size, offset); + rocrand_discrete_distribution_st discrete_dist; + ROCRAND_CHECK( + distribution_factory_t::create(poisson_probabilities, size, offset, discrete_dist)); + + poisson_distribution dis(discrete_dist, lambda); const size_t samples_count = static_cast(std::max(2.0, sqrt(lambda))) * 100000; const size_t bin_size = static_cast(std::max(2.0, sqrt(lambda))); @@ -100,7 +96,7 @@ TEST_P(poisson_distribution_tests, histogram_compare) for(size_t si = 0; si < samples_count; si++) { - const unsigned int v = dis(gen()); + const unsigned int v = dis(static_cast(gen())); const size_t bin = v / bin_size; if(bin < bins_count) { @@ -108,7 +104,7 @@ TEST_P(poisson_distribution_tests, histogram_compare) } } - dis.deallocate(); + distribution_factory_t::deallocate(discrete_dist); // for small lambda, histogram test is inaccurate due to relatively large bins // for large lambda, expected value calculation is inaccurate due to non-finite terms diff --git a/test/internal/test_rocrand_generator_type.cpp 
b/test/internal/test_rocrand_generator_type.cpp index 7bf5fe0bd..64b50750d 100644 --- a/test/internal/test_rocrand_generator_type.cpp +++ b/test/internal/test_rocrand_generator_type.cpp @@ -63,6 +63,12 @@ struct dummy_generator : generator_impl_base return ROCRAND_STATUS_SUCCESS; } + rocrand_status set_stream(hipStream_t stream) + { + generator_impl_base::set_stream(stream); + return ROCRAND_STATUS_SUCCESS; + } + rocrand_status init() { return ROCRAND_STATUS_SUCCESS; diff --git a/test/test_common.hpp b/test/test_common.hpp index 57b88edfb..8e19d0a05 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -137,4 +137,27 @@ void assert_near(const std::vector& a, const std::vector& b, double eps) } } +template +double get_mean(const std::vector& values) +{ + double mean = 0.0f; + for(auto v : values) + { + mean += static_cast(v); + } + return mean / values.size(); +} + +template +double get_variance(const std::vector& values, double mean) +{ + double variance = 0.0f; + for(auto v : values) + { + const double x = static_cast(v) - mean; + variance += x * x; + } + return variance / values.size(); +} + #endif // TEST_COMMON_HPP_ From ca1020682fcae5d83ff8a11c9eeaf21b4ed1d61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 16 May 2024 12:38:42 +0000 Subject: [PATCH 34/54] use ROCRAND_HIP_FATAL_ASSERT for hipDeviceSynchronize call --- library/src/rng/system.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/library/src/rng/system.hpp b/library/src/rng/system.hpp index 7d692c8bd..83608f312 100644 --- a/library/src/rng/system.hpp +++ b/library/src/rng/system.hpp @@ -80,11 +80,7 @@ struct host_system template static void free(T* ptr) { - hipError_t status = hipDeviceSynchronize(); - if(status != hipSuccess) - { - return; - } + ROCRAND_HIP_FATAL_ASSERT(hipDeviceSynchronize()); delete[] ptr; } From 472fd0f80f2488ee9be365a108cf74eb99e9c57f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 22 May 2024 11:59:13 +0000 Subject: [PATCH 35/54] generate_poisson test with many lambdas --- test/test_rocrand_generate_poisson.cpp | 82 +++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/test/test_rocrand_generate_poisson.cpp b/test/test_rocrand_generate_poisson.cpp index a0d140aba..4c7befb4e 100644 --- a/test/test_rocrand_generate_poisson.cpp +++ b/test/test_rocrand_generate_poisson.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -18,7 +18,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include #include #include @@ -27,6 +26,9 @@ #include "test_common.hpp" #include "test_rocrand_common.hpp" +#include +#include + class rocrand_generate_poisson_tests : public ::testing::TestWithParam {}; @@ -70,6 +72,69 @@ void test_out_of_range(GenerateFunc generate_func) ROCRAND_CHECK(rocrand_destroy_generator(generator)); } +void test_multiple_lambdas(const rocrand_rng_type rng_type, const hipStream_t stream) +{ + rocrand_generator generator; + ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); + ROCRAND_CHECK(rocrand_set_stream(generator, stream)); + + constexpr size_t num_invocations = 20; + constexpr size_t size = 125638; + constexpr double min_lambda = 0.001; + constexpr double max_lambda = 5000; + constexpr unsigned int seed = 654321; + + std::uniform_real_distribution lambda_distribution(min_lambda, max_lambda); + std::default_random_engine rng(seed); + std::vector lambdas(num_invocations); + for(auto& lambda : lambdas) + { + lambda = lambda_distribution(rng); + } + + std::vector d_results(num_invocations); + for(auto& d_ptr : 
d_results) + { + HIP_CHECK(hipMallocHelper(&d_ptr, sizeof(*d_ptr) * size)); + } + + std::vector> h_results(num_invocations); + for(auto& h_vec : h_results) + { + h_vec.resize(size); + } + + for(size_t i = 0; i < num_invocations; ++i) + { + ROCRAND_CHECK(rocrand_generate_poisson(generator, d_results[i], size, lambdas[i])); + } + + HIP_CHECK(hipStreamSynchronize(stream)); + + for(size_t i = 0; i < num_invocations; ++i) + { + const auto lambda = lambdas[i]; + auto& values = h_results[i]; + HIP_CHECK(hipMemcpy(values.data(), + d_results[i], + sizeof(*d_results[i]) * size, + hipMemcpyDeviceToHost)); + + const double mean = get_mean(values); + const double variance = get_variance(values, mean); + + EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 3e-2)); + EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 2e-2)); + } + + for(auto* d_ptr : d_results) + { + HIP_CHECK(hipFree(d_ptr)); + } + + ROCRAND_CHECK(rocrand_destroy_generator(generator)); +} + TEST_P(rocrand_generate_poisson_tests, generate_test) { test_generate( @@ -97,6 +162,19 @@ TEST_P(rocrand_generate_poisson_tests, out_of_range_test) { return rocrand_generate_poisson(gen, data, size, lambda); }); } +TEST_P(rocrand_generate_poisson_tests, multiple_lambdas_default_stream) +{ + test_multiple_lambdas(GetParam(), hipStreamDefault); +} + +TEST_P(rocrand_generate_poisson_tests, multiple_lambdas_non_blocking_stream) +{ + hipStream_t stream; + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + test_multiple_lambdas(GetParam(), stream); + HIP_CHECK(hipStreamDestroy(stream)); +} + INSTANTIATE_TEST_SUITE_P(rocrand_generate_poisson_tests, rocrand_generate_poisson_tests, ::testing::ValuesIn(rng_types)); From c27eece30a3705790123ae43850788cd977d7e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 22 May 2024 11:59:31 +0000 Subject: [PATCH 36/54] Testing poisson with hipGraphs --- test/test_rocrand_hipgraphs.cpp | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 
insertions(+) diff --git a/test/test_rocrand_hipgraphs.cpp b/test/test_rocrand_hipgraphs.cpp index d6a671446..12e1d64f4 100644 --- a/test/test_rocrand_hipgraphs.cpp +++ b/test/test_rocrand_hipgraphs.cpp @@ -140,6 +140,50 @@ TEST_P(rocrand_hipgraph_generate_tests, uniform_float_test) HIP_CHECK(hipStreamDestroy(stream)); } +TEST_P(rocrand_hipgraph_generate_tests, poisson_test) +{ + const rocrand_rng_type rng_type = GetParam(); + + rocrand_generator generator; + ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); + + ROCRAND_CHECK(rocrand_initialize_generator(generator)); + + constexpr size_t size = 12563; + unsigned int* data; + HIP_CHECK(hipMallocHelper(&data, size * sizeof(*data))); + HIP_CHECK(hipDeviceSynchronize()); + + // Default stream does not support hipGraph stream capture, so create a non-blocking one + hipStream_t stream = 0; + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + rocrand_set_stream(generator, stream); + + hipGraphExec_t graph_instance; + hipGraph_t graph = test_utils::createGraphHelper(stream); + + // Any sizes + ROCRAND_CHECK(rocrand_generate_poisson(generator, data, 1, 10.0)); + + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + test_utils::resetGraphHelper(graph, graph_instance, stream); + + // Any alignment + ROCRAND_CHECK(rocrand_generate_poisson(generator, data + 1, 2, 500.0)); + + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + test_utils::resetGraphHelper(graph, graph_instance, stream); + + ROCRAND_CHECK(rocrand_generate_poisson(generator, data, size, 5000.0)); + + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + + HIP_CHECK(hipFree(data)); + ROCRAND_CHECK(rocrand_destroy_generator(generator)); + test_utils::cleanupGraphHelper(graph, graph_instance); + HIP_CHECK(hipStreamDestroy(stream)); +} + INSTANTIATE_TEST_SUITE_P(rocrand_hipgraph_generate_tests, rocrand_hipgraph_generate_tests, 
::testing::ValuesIn(rng_types)); From 013014123bb78952879c13e5cd47aa887e82e5a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 22 May 2024 12:00:15 +0000 Subject: [PATCH 37/54] Test [blocking] host_generator with non-blocking stream --- test/test_rocrand_host.cpp | 197 +++++++++++++++++++++++++------------ 1 file changed, 134 insertions(+), 63 deletions(-) diff --git a/test/test_rocrand_host.cpp b/test/test_rocrand_host.cpp index cb7d810a7..f6cdcdf00 100644 --- a/test/test_rocrand_host.cpp +++ b/test/test_rocrand_host.cpp @@ -47,39 +47,99 @@ std::vector get_seeds() return ret; } -constexpr rocrand_rng_type host_rng_types[] = { - ROCRAND_RNG_PSEUDO_PHILOX4_32_10, - ROCRAND_RNG_PSEUDO_LFSR113, - ROCRAND_RNG_PSEUDO_MRG31K3P, - ROCRAND_RNG_PSEUDO_MRG32K3A, - ROCRAND_RNG_PSEUDO_MT19937, - ROCRAND_RNG_PSEUDO_MTGP32, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, - ROCRAND_RNG_PSEUDO_XORWOW, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64, - ROCRAND_RNG_QUASI_SOBOL32, - ROCRAND_RNG_QUASI_SOBOL64, +struct host_test_params +{ + rocrand_rng_type rng_type; + bool blocking_host_generator; + bool use_default_stream; + + friend std::ostream& operator<<(std::ostream& os, const host_test_params& params) + { + os << "{ " + << "rng_type: " << params.rng_type << ", blocking: " << params.blocking_host_generator + << ", default_stream: " << params.use_default_stream << " }"; + return os; + } }; -} // namespace +constexpr host_test_params host_test_params_array[] = { + { ROCRAND_RNG_PSEUDO_PHILOX4_32_10, false, true}, + { ROCRAND_RNG_PSEUDO_LFSR113, false, true}, + { ROCRAND_RNG_PSEUDO_MRG31K3P, false, true}, + { ROCRAND_RNG_PSEUDO_MRG32K3A, false, true}, + { ROCRAND_RNG_PSEUDO_MT19937, false, true}, + { ROCRAND_RNG_PSEUDO_MTGP32, false, true}, + { ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, false, true}, + { 
ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, false, true}, + { ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, false, true}, + { ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, false, true}, + { ROCRAND_RNG_PSEUDO_XORWOW, false, true}, + {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, false, true}, + {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64, false, true}, + { ROCRAND_RNG_QUASI_SOBOL32, false, true}, + { ROCRAND_RNG_QUASI_SOBOL64, false, true}, + + { ROCRAND_RNG_PSEUDO_XORWOW, false, false}, + { ROCRAND_RNG_PSEUDO_XORWOW, true, false}, + { ROCRAND_RNG_PSEUDO_XORWOW, true, true}, + + { ROCRAND_RNG_QUASI_SOBOL32, false, false}, + { ROCRAND_RNG_QUASI_SOBOL32, true, false}, + { ROCRAND_RNG_QUASI_SOBOL32, true, true}, +}; -class rocrand_generate_host_test : public ::testing::TestWithParam -{}; +} // namespace -void test_int(const rocrand_rng_type rng_type, const size_t test_size) +class rocrand_generate_host_test : public ::testing::TestWithParam { - if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) +protected: + void SetUp() override { - ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + if(GetParam().rng_type == ROCRAND_RNG_PSEUDO_MT19937) + { + ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); + } + if(!GetParam().use_default_stream) + { + HIP_CHECK(hipStreamCreateWithFlags(&m_custom_stream, hipStreamNonBlocking)); + } + } + + void TearDown() override + { + if(!GetParam().use_default_stream) + { + HIP_CHECK(hipStreamDestroy(m_custom_stream)); + } + } + + rocrand_generator get_generator() + { + const auto params = GetParam(); + rocrand_generator generator; + if(params.blocking_host_generator) + { + EXPECT_EQ(ROCRAND_STATUS_SUCCESS, + rocrand_create_generator_host_blocking(&generator, params.rng_type)); + } + else + { + EXPECT_EQ(ROCRAND_STATUS_SUCCESS, + rocrand_create_generator_host(&generator, params.rng_type)); + } + if(!params.use_default_stream) + { + EXPECT_EQ(ROCRAND_STATUS_SUCCESS, rocrand_set_stream(generator, m_custom_stream)); + } + return generator; } - rocrand_generator generator; - 
ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); +private: + hipStream_t m_custom_stream; +}; +void test_int(rocrand_generator generator, const size_t test_size) +{ std::vector results(test_size); for(size_t i = 0; i < seeds_count + random_seeds_count; ++i) { @@ -109,29 +169,24 @@ void test_int(const rocrand_rng_type rng_type, const size_t test_size) TEST_P(rocrand_generate_host_test, int_test) { - test_int(GetParam(), 11111); + test_int(get_generator(), 11111); } TEST_P(rocrand_generate_host_test, int_test_large) { ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); constexpr size_t large_test_size = size_t(INT_MAX) + 1; - test_int(GetParam(), large_test_size); + test_int(get_generator(), large_test_size); } template -void test_int_parity(rocrand_rng_type rng_type, +void test_int_parity(rocrand_generator host_generator, + rocrand_rng_type rng_type, F generate, const std::vector& seeds = get_seeds()) { - if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) - { - ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); - } - - rocrand_generator device_generator, host_generator; + rocrand_generator device_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); - ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); std::vector host_results(218192); std::vector device_results(host_results.size()); @@ -152,6 +207,7 @@ void test_int_parity(rocrand_rng_type rng_type, output, host_results.size() * sizeof(Type), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); assert_eq(host_results, device_results); } @@ -163,32 +219,27 @@ void test_int_parity(rocrand_rng_type rng_type, TEST_P(rocrand_generate_host_test, char_parity_test) { - test_int_parity(GetParam(), rocrand_generate_char); + test_int_parity(get_generator(), GetParam().rng_type, rocrand_generate_char); } TEST_P(rocrand_generate_host_test, short_parity_test) { - test_int_parity(GetParam(), rocrand_generate_short); + test_int_parity(get_generator(), GetParam().rng_type, 
rocrand_generate_short); } TEST_P(rocrand_generate_host_test, int_parity_test) { - test_int_parity(GetParam(), rocrand_generate); + test_int_parity(get_generator(), GetParam().rng_type, rocrand_generate); } template -void test_uniform_parity(rocrand_rng_type rng_type, +void test_uniform_parity(rocrand_generator host_generator, + rocrand_rng_type rng_type, F generate, const std::vector& seeds = get_seeds()) { - if(rng_type == ROCRAND_RNG_PSEUDO_MT19937) - { - ROCRAND_SKIP_SLOW_TEST_IF_NOT_ENABLED(); - } - - rocrand_generator device_generator, host_generator; + rocrand_generator device_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); - ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); std::vector host_results(218192); std::vector device_results(host_results.size()); @@ -209,6 +260,7 @@ void test_uniform_parity(rocrand_rng_type rng_type, output, host_results.size() * sizeof(Type), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); assert_eq(host_results, device_results); } @@ -220,21 +272,24 @@ void test_uniform_parity(rocrand_rng_type rng_type, TEST_P(rocrand_generate_host_test, uniform_half_parity_test) { - test_uniform_parity(GetParam(), rocrand_generate_uniform_half); + test_uniform_parity(get_generator(), GetParam().rng_type, rocrand_generate_uniform_half); } TEST_P(rocrand_generate_host_test, uniform_float_parity_test) { - test_uniform_parity(GetParam(), rocrand_generate_uniform); + test_uniform_parity(get_generator(), GetParam().rng_type, rocrand_generate_uniform); } TEST_P(rocrand_generate_host_test, uniform_double_parity_test) { - test_uniform_parity(GetParam(), rocrand_generate_uniform_double); + test_uniform_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_uniform_double); } template -void test_normal_parity(rocrand_rng_type rng_type, +void test_normal_parity(rocrand_generator host_generator, + rocrand_rng_type rng_type, F generate, double eps, const std::vector& seeds = 
get_seeds()) @@ -247,9 +302,8 @@ void test_normal_parity(rocrand_rng_type rng_type, Type mean = static_cast(-12.0); Type stddev = static_cast(2.4); - rocrand_generator device_generator, host_generator; + rocrand_generator device_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); - ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); std::vector host_results(218192); std::vector device_results(host_results.size()); @@ -271,6 +325,7 @@ void test_normal_parity(rocrand_rng_type rng_type, output, host_results.size() * sizeof(Type), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); // This rounding is required because the sine and cosine used in box-muller used in the normal // distribution is slightly different from the one used on the host. @@ -284,43 +339,58 @@ void test_normal_parity(rocrand_rng_type rng_type, TEST_P(rocrand_generate_host_test, normal_half_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_normal_half, 0.1); + test_normal_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_normal_half, + 0.1); } TEST_P(rocrand_generate_host_test, normal_float_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_normal, 0.005); + test_normal_parity(get_generator(), GetParam().rng_type, rocrand_generate_normal, 0.005); } TEST_P(rocrand_generate_host_test, normal_double_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_normal_double, 0.000001); + test_normal_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_normal_double, + 0.000001); } TEST_P(rocrand_generate_host_test, log_normal_half_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_log_normal_half, 0.05); + test_normal_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_log_normal_half, + 0.05); } TEST_P(rocrand_generate_host_test, log_normal_float_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_log_normal, 0.0001); + 
test_normal_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_log_normal, + 0.0001); } TEST_P(rocrand_generate_host_test, log_normal_double_parity_test) { - test_normal_parity(GetParam(), rocrand_generate_log_normal_double, 0.0000001); + test_normal_parity(get_generator(), + GetParam().rng_type, + rocrand_generate_log_normal_double, + 0.0000001); } TEST_P(rocrand_generate_host_test, poisson_parity_test) { - const rocrand_rng_type rng_type = GetParam(); + const rocrand_rng_type rng_type = GetParam().rng_type; using Type = unsigned int; double lambda = 1.1; - rocrand_generator device_generator, host_generator; + rocrand_generator host_generator = get_generator(); + rocrand_generator device_generator; ROCRAND_CHECK(rocrand_create_generator(&device_generator, rng_type)); - ROCRAND_CHECK(rocrand_create_generator_host(&host_generator, rng_type)); std::vector host_results(218192); std::vector device_results(host_results.size()); @@ -345,9 +415,10 @@ TEST_P(rocrand_generate_host_test, poisson_parity_test) output, host_results.size() * sizeof(Type), hipMemcpyDeviceToHost)); - } + HIP_CHECK(hipDeviceSynchronize()); - assert_eq(host_results, device_results); + assert_eq(host_results, device_results); + } ROCRAND_CHECK(rocrand_destroy_generator(host_generator)); ROCRAND_CHECK(rocrand_destroy_generator(device_generator)); @@ -356,4 +427,4 @@ TEST_P(rocrand_generate_host_test, poisson_parity_test) INSTANTIATE_TEST_SUITE_P(rocrand_generate_host_test, rocrand_generate_host_test, - ::testing::ValuesIn(host_rng_types)); + ::testing::ValuesIn(host_test_params_array)); From 8c83427f62a4f84d3bdf9eb34ec83fd6447142f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 22 May 2024 13:01:26 +0000 Subject: [PATCH 38/54] Fixing poisson distribution selection in benchmark_tuning --- benchmark/tuning/benchmark_tuning.hpp | 9 +++-- benchmark/tuning/distribution_traits.hpp | 51 +++++++++++++++++++++++- 2 files changed, 54 insertions(+), 6 
deletions(-) diff --git a/benchmark/tuning/benchmark_tuning.hpp b/benchmark/tuning/benchmark_tuning.hpp index 271ebaed7..dfcaa3e8d 100644 --- a/benchmark/tuning/benchmark_tuning.hpp +++ b/benchmark/tuning/benchmark_tuning.hpp @@ -88,7 +88,10 @@ void run_benchmark(benchmark::State& state, const benchmark_config& config) generator.set_stream(stream); const auto generate_func = [&] - { return generator.generate(data, size, default_distribution{}(config)); }; + { + default_distribution default_distribution_provider; + return generator.generate(data, size, default_distribution_provider(config)); + }; // Warm-up ROCRAND_CHECK(generate_func()); @@ -147,9 +150,7 @@ class generator_benchmark_factory if constexpr(std::is_same_v) { // The poisson distribution is only supported for unsigned int. - using poisson_distribution_t = rocrand_impl::host::poisson_distribution< - rocrand_impl::host::DISCRETE_METHOD_ALIAS>; - add_benchmarks_impl(); + add_benchmarks_impl>(); } } else if constexpr(std::is_floating_point_v || std::is_same_v) diff --git a/benchmark/tuning/distribution_traits.hpp b/benchmark/tuning/distribution_traits.hpp index 87baebcb5..f65ccc357 100644 --- a/benchmark/tuning/distribution_traits.hpp +++ b/benchmark/tuning/distribution_traits.hpp @@ -146,6 +146,15 @@ struct distribution_name< } }; +template<> +struct distribution_name +{ + std::string operator()() + { + return "poisson_unsigned_int"; + } +}; + template struct default_distribution { @@ -191,11 +200,49 @@ struct default_distribution< { auto operator()(const benchmark_config& config) { - return rocrand_impl::host::poisson_distribution( - config.lambda); + return std::get< + rocrand_impl::host::poisson_distribution>( + m_poisson_manager.get_distribution(config.lambda)); + } + +private: + rocrand_impl::host::poisson_distribution_manager + m_poisson_manager; +}; + +template<> +struct default_distribution +{ + auto operator()(const benchmark_config& config) + { + auto poisson_distribution = std::get< + 
rocrand_impl::host::poisson_distribution>( + m_poisson_manager.get_distribution(config.lambda)); + return rocrand_impl::host::mrg_poisson_distribution(poisson_distribution); } + +private: + rocrand_impl::host::poisson_distribution_manager + m_poisson_manager; }; +template class GeneratorTemplate> +struct select_poisson_distribution +{ + using dummy_generator_t = GeneratorTemplate>; + static constexpr inline rocrand_rng_type rng_type = dummy_generator_t::type(); + static constexpr inline bool is_mrg + = rng_type == ROCRAND_RNG_PSEUDO_MRG31K3P || rng_type == ROCRAND_RNG_PSEUDO_MRG32K3A; + + using type = std::conditional_t< + is_mrg, + rocrand_impl::host::mrg_poisson_distribution, + rocrand_impl::host::poisson_distribution>; +}; + +template class GeneratorTemplate> +using select_poisson_distribution_t = typename select_poisson_distribution::type; + } // namespace benchmark_tuning #endif // ROCRAND_BENCHMARK_TUNING_DISTRIBUTION_TRAITS_HPP_ From ecbd2a712b92c84ea26508a22f2511fd73047607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 23 May 2024 07:23:05 +0000 Subject: [PATCH 39/54] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1761a550f..bd736c772 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Documentation for rocRAND is available at ### Additions * Added host generator for MT19937 +* Support for `rocrand_generate_poisson` in hipGraphs ### Changes From e2c573f87a8e0171d315c645b3fa965d007b5969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 6 Jun 2024 13:17:07 +0200 Subject: [PATCH 40/54] fix(docs): Added links to unaccessible doc pages --- docs/{ => conceptual}/data-type-support.rst | 6 ++++++ docs/index.rst | 2 ++ docs/sphinx/_toc.yml.in | 2 ++ 3 files changed, 10 insertions(+) rename docs/{ => conceptual}/data-type-support.rst (98%) diff --git a/docs/data-type-support.rst 
b/docs/conceptual/data-type-support.rst similarity index 98% rename from docs/data-type-support.rst rename to docs/conceptual/data-type-support.rst index 1567c9d4d..619300306 100644 --- a/docs/data-type-support.rst +++ b/docs/conceptual/data-type-support.rst @@ -1,3 +1,9 @@ +.. meta:: + :description: rocRAND documentation and API reference library + :keywords: rocRAND, ROCm, API, documentation, cuRAND + +.. _data-type-support: + Data type support ****************************************** diff --git a/docs/index.rst b/docs/index.rst index 725655662..eba1fb946 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,6 +31,7 @@ The documentation is structured as follows: .. grid-item-card:: Conceptual * :ref:`programmers-guide` + * :ref:`data-type-support` * :ref:`curand-compatibility` * :ref:`dynamic-ordering-configuration` @@ -38,6 +39,7 @@ The documentation is structured as follows: * :ref:`cpp-api` * :ref:`python-api` + * :doc:`Fortran API reference ` * :doc:`API library ` To contribute to the documentation, refer to diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 826bc67fd..11dd63904 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -8,12 +8,14 @@ subtrees: - caption: Conceptual entries: - file: conceptual/programmers-guide + - file: conceptual/data-type-support - file: conceptual/curand-compatibility - file: conceptual/dynamic_ordering_configuration - caption: API reference entries: - file: api-reference/cpp-api - file: api-reference/python-api + - file: fortran-api-reference - file: doxygen/html/index - caption: About entries: From 511291a8e6fc725d5c8a511dc3c69b2fbb390036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 6 Jun 2024 13:17:43 +0200 Subject: [PATCH 41/54] fix(docs): Removed duplicated CUDA Compatibility section from Programmer's guide --- docs/conceptual/programmers-guide.rst | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git 
a/docs/conceptual/programmers-guide.rst b/docs/conceptual/programmers-guide.rst index 035d93ecf..023409380 100644 --- a/docs/conceptual/programmers-guide.rst +++ b/docs/conceptual/programmers-guide.rst @@ -153,25 +153,3 @@ All supported orderings for all generators are detailed below: ================================== ==================================================================================================================== `ROCRAND_ORDERING_QUASI_DEFAULT` The (scrambled) 32- and 64-bit sobol quasi-random number generators generated the result from :math:`d` dimensions by flattening them into the output. The result at offset :math:`n` in memory is generated from offset :math:`n\;\mathrm{mod}\; d` in dimension :math:`\lfloor n / d \rfloor`, where :math:`d` is the generator's number of dimensions. ================================== ==================================================================================================================== - -cuRAND Compatibility --------------------- - -The following table shows which rocRAND generators produce the exact same sequence as the equivalent cuRAND generator when using legacy ordering, given the same seed, number of dimensions, and offset. - -.. 
table:: cuRAND Compatibility - :widths: auto - - ================= ===================== - Generator Compatibile - ================= ===================== - XORWOW No - MRG32K3A No - MTGP32 No - Philox 32x4-10 No - MT19937 No - Sobol32 Yes - Scrambled Sobol32 No - Sobol64 Yes, with same offset - Scrambled Sobol64 No - ================= ===================== From 4b802a7b7725107269448802c69387079b5bff41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Thu, 6 Jun 2024 14:48:12 +0200 Subject: [PATCH 42/54] Added hipGraphs doc and sample --- docs/conceptual/programmers-guide.rst | 8 ++++++ test/hipgraphs_doc_sample.hpp | 37 +++++++++++++++++++++++++++ test/test_rocrand_hipgraphs.cpp | 10 ++++++-- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 test/hipgraphs_doc_sample.hpp diff --git a/docs/conceptual/programmers-guide.rst b/docs/conceptual/programmers-guide.rst index 023409380..4ebc971ce 100644 --- a/docs/conceptual/programmers-guide.rst +++ b/docs/conceptual/programmers-guide.rst @@ -153,3 +153,11 @@ All supported orderings for all generators are detailed below: ================================== ==================================================================================================================== `ROCRAND_ORDERING_QUASI_DEFAULT` The (scrambled) 32- and 64-bit sobol quasi-random number generators generated the result from :math:`d` dimensions by flattening them into the output. The result at offset :math:`n` in memory is generated from offset :math:`n\;\mathrm{mod}\; d` in dimension :math:`\lfloor n / d \rfloor`, where :math:`d` is the generator's number of dimensions. ================================== ==================================================================================================================== + +Using rocRAND in HIP Graphs +=========================== + +rocRAND supports the capturing of the random number generation with HIP Graphs. 
However, the construction, initialization and cleanup of the generator objects must take place outside of the recorded section. See the following example (error handling omitted for brevity): + +.. literalinclude:: ../../test/hipgraphs_doc_sample.hpp + :language: c++ diff --git a/test/hipgraphs_doc_sample.hpp b/test/hipgraphs_doc_sample.hpp new file mode 100644 index 000000000..efa637e69 --- /dev/null +++ b/test/hipgraphs_doc_sample.hpp @@ -0,0 +1,37 @@ +size_t size = 1000; +float* data_0; +unsigned int* data_1; + +hipMalloc(&data_0, sizeof(*data_0) * size); +hipMalloc(&data_1, sizeof(*data_1) * size); + +hipGraph_t graph; +hipGraphCreate(&graph, 0); + +hipStream_t stream; +hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + +rocrand_generator generator; +rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT); +rocrand_set_stream(generator, stream); +rocrand_initialize_generator(generator); + +hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal); + +rocrand_generate_normal(generator, data_0, size, 10.0F, 2.0F); +rocrand_generate_poisson(generator, data_1, size, 3); + +hipStreamEndCapture(stream, &graph); + +hipGraphExec_t instance; +hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0); + +hipGraphLaunch(instance, stream); +hipStreamSynchronize(stream); + +hipGraphExecDestroy(instance); +rocrand_destroy_generator(generator); +hipStreamDestroy(stream); +hipGraphDestroy(graph); +hipFree(data_1); +hipFree(data_0); diff --git a/test/test_rocrand_hipgraphs.cpp b/test/test_rocrand_hipgraphs.cpp index 12e1d64f4..3b94186f6 100644 --- a/test/test_rocrand_hipgraphs.cpp +++ b/test/test_rocrand_hipgraphs.cpp @@ -184,6 +184,12 @@ TEST_P(rocrand_hipgraph_generate_tests, poisson_test) HIP_CHECK(hipStreamDestroy(stream)); } +TEST(rocrand_hipgraph_generate_tests, hipgraphs_doc_sample){ + +#include "hipgraphs_doc_sample.hpp" + +} + INSTANTIATE_TEST_SUITE_P(rocrand_hipgraph_generate_tests, - rocrand_hipgraph_generate_tests, - 
::testing::ValuesIn(rng_types)); + rocrand_hipgraph_generate_tests, + ::testing::ValuesIn(rng_types)); From a4388af3c27be0a5deaeb70b92c3c2182120725d Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 17 Jun 2024 13:00:47 +0500 Subject: [PATCH 43/54] Fix performance regression of Poisson distribution Recent changes required for HIP graph support added a new path with approximation of Poisson with normal distribution when lambda is large. However, the decision whether to use the alias/CDF methods or the approximation is made in the kernel for every generated value even though lambda is the same. This change moves it to host side: depending on lambda the kernel is launched with one of two distributions (poisson_distribution or poisson_distribution_huge). --- library/src/rng/distribution/poisson.hpp | 81 +++++++++++++++------ library/src/rng/lfsr113.hpp | 14 +++- library/src/rng/mrg.hpp | 18 +++-- library/src/rng/mt19937.hpp | 14 +++- library/src/rng/mtgp32.hpp | 14 +++- library/src/rng/philox4x32_10.hpp | 14 +++- library/src/rng/sobol.hpp | 14 +++- library/src/rng/threefry.hpp | 14 +++- library/src/rng/xorwow.hpp | 14 +++- test/internal/test_poisson_distribution.cpp | 4 +- 10 files changed, 142 insertions(+), 59 deletions(-) diff --git a/library/src/rng/distribution/poisson.hpp b/library/src/rng/distribution/poisson.hpp index a34e1bd66..8a7773f57 100644 --- a/library/src/rng/distribution/poisson.hpp +++ b/library/src/rng/distribution/poisson.hpp @@ -41,6 +41,11 @@ namespace rocrand_impl::host { +// Precise calculation of Poisson distribution using table methods: +// * the alias method (for PRNGs only as it does not preserve quasi-randomness); +// * the binary search in the CDF (cumulative distribution function) table (suitable for both +// QRNGs and PRNGs but not preferred for PRNGs because the alias method is faster); + template class poisson_distribution : private discrete_distribution_base { @@ -50,22 +55,40 @@ class poisson_distribution : private 
discrete_distribution_base using base_t = discrete_distribution_base; - poisson_distribution(const rocrand_discrete_distribution_st& distribution, const double lambda) - : base_t(distribution), m_lambda(lambda) + poisson_distribution(const rocrand_discrete_distribution_st& distribution) + : base_t(distribution) {} template __forceinline__ __host__ __device__ unsigned int operator()(T x) const { - if(m_lambda > rocrand_device::detail::lambda_threshold_huge) - { - const double normal_d = rocrand_device::detail::normal_distribution_double(x); - return static_cast(round(sqrt(m_lambda) * normal_d + m_lambda)); - } - else - { - return base_t::operator()(x); - } + return base_t::operator()(x); + } + + template + __forceinline__ __host__ __device__ void operator()(const T (&input)[1], + unsigned int (&output)[1]) const + { + output[0] = (*this)(input[0]); + } +}; + +// Approximation of Poisson distribution with normal distribution when lambda is large + +class poisson_distribution_huge +{ +public: + static constexpr unsigned int input_width = 1; + static constexpr unsigned int output_width = 1; + + poisson_distribution_huge(const double lambda) : m_lambda(lambda), m_sqrt_lambda(sqrt(lambda)) + {} + + template + __forceinline__ __host__ __device__ unsigned int operator()(T x) const + { + const double normal_d = rocrand_device::detail::normal_distribution_double(x); + return static_cast(round(m_sqrt_lambda * normal_d + m_lambda)); } template @@ -77,6 +100,7 @@ class poisson_distribution : private discrete_distribution_base private: double m_lambda; + double m_sqrt_lambda; }; [[nodiscard]] inline std::vector @@ -143,8 +167,9 @@ template; - using distribution_t = poisson_distribution; + using factory_t = discrete_distribution_factory; + using distribution_t = poisson_distribution; + using approx_distribution_t = poisson_distribution_huge; poisson_distribution_manager() = default; @@ -244,7 +269,8 @@ class poisson_distribution_manager return ROCRAND_STATUS_SUCCESS; } - 
std::variant get_distribution(const double lambda) + std::variant + get_distribution(const double lambda) { if(!m_initialized) { @@ -255,6 +281,11 @@ class poisson_distribution_manager } } + if(lambda > rocrand_device::detail::lambda_threshold_huge) + { + return approx_distribution_t(lambda); + } + std::unique_lock lock(m_mutex, std::defer_lock_t{}); if(!m_is_host_func_blocking) { @@ -262,7 +293,7 @@ class poisson_distribution_manager } const bool changed = lambda != m_lambda; - if(changed && lambda <= rocrand_device::detail::lambda_threshold_huge) + if(changed) { auto arg = std::make_unique( update_discrete_distribution_arg{lambda, this}); @@ -311,7 +342,7 @@ class poisson_distribution_manager rocrand_discrete_distribution_st distribution_copy = m_distribution; calculate_poisson_size(lambda, distribution_copy.size, distribution_copy.offset); - return distribution_t(distribution_copy, lambda); + return distribution_t(distribution_copy); } private: @@ -377,28 +408,30 @@ class poisson_distribution_manager // Mrg32k3a and Mrg31k3p -template +template> struct mrg_engine_poisson_distribution { - using distribution_type = poisson_distribution; - static constexpr unsigned int input_width = 1; + using distribution_type = DistributionType; + + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; distribution_type dis; explicit mrg_engine_poisson_distribution(distribution_type dis) : dis(dis) {} - __host__ __device__ - void operator()(const unsigned int (&input)[1], unsigned int (&output)[1]) const + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], + unsigned int (&output)[1]) const { // Alias method requires x in [0, 1), uint must be in [0, UINT_MAX], // but MRG-based engine's "raw" output is in [1, MRG_M1], // so probabilities are slightly different than expected, // some values can not be generated at all. 
// Hence the "raw" value is remapped to [0, UINT_MAX]: - unsigned int v - = rocrand_device::detail::mrg_uniform_distribution_uint(input[0]); - output[0] = dis(v); + unsigned int input2[1]; + input2[0] = rocrand_device::detail::mrg_uniform_distribution_uint(input[0]); + dis(input2, output); } }; diff --git a/library/src/rng/lfsr113.hpp b/library/src/rng/lfsr113.hpp index 8473d466d..7b3348dfc 100644 --- a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -172,6 +172,8 @@ class lfsr113_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; lfsr113_generator_template(uint4 seeds = {ROCRAND_LFSR113_DEFAULT_SEED_X, ROCRAND_LFSR113_DEFAULT_SEED_Y, @@ -456,12 +458,16 @@ class lfsr113_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index 9d474f04c..4a98a8a30 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -166,6 +166,8 @@ class mrg_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename 
poisson_distribution_manager_t::approx_distribution_t; mrg_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -425,13 +427,19 @@ class mrg_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + mrg_engine_poisson_distribution mrg_dis(*dis); + return generate(data, data_size, mrg_dis); } - mrg_engine_poisson_distribution mrg_dis(std::get(dis)); - return generate(data, data_size, mrg_dis); + if(auto* dis = std::get_if(&result)) + { + mrg_engine_poisson_distribution mrg_dis( + *dis); + return generate(data, data_size, mrg_dis); + } + return std::get(result); } private: diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index a2cb3e46f..a65c80d03 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -666,6 +666,8 @@ class mt19937_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; static constexpr inline unsigned int threads_per_generator = octo_engine_type::threads_per_generator; @@ -1070,12 +1072,16 @@ class mt19937_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + 
if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index 0693bc78f..f5460bd30 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -308,6 +308,8 @@ class mtgp32_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; mtgp32_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -565,12 +567,16 @@ class mtgp32_generator_template : public generator_impl_base return status; } } - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/philox4x32_10.hpp b/library/src/rng/philox4x32_10.hpp index a4c02b1f0..04db05a16 100644 --- a/library/src/rng/philox4x32_10.hpp +++ b/library/src/rng/philox4x32_10.hpp @@ -223,6 +223,8 @@ class philox4x32_10_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; philox4x32_10_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -390,12 +392,16 @@ class philox4x32_10_generator_template : public generator_impl_base 
rocrand_status generate_poisson(unsigned int* data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp index 84a90b85c..00d3271d7 100644 --- a/library/src/rng/sobol.hpp +++ b/library/src/rng/sobol.hpp @@ -497,6 +497,8 @@ class sobol_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; sobol_generator_template(unsigned long long offset = 0, rocrand_ordering order = ROCRAND_ORDERING_QUASI_DEFAULT, @@ -735,12 +737,16 @@ class sobol_generator_template : public generator_impl_base { static_assert(Is64 || std::is_same_v, "The 32 bit sobol generator can only generate 32bit poisson"); - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/threefry.hpp b/library/src/rng/threefry.hpp index bcf589838..476b7cccb 100644 --- a/library/src/rng/threefry.hpp +++ b/library/src/rng/threefry.hpp @@ -208,6 +208,8 @@ class threefry_generator_template : public 
generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; threefry_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -406,12 +408,16 @@ class threefry_generator_template : public generator_impl_base template rocrand_status generate_poisson(T* data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } - return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 1c518c987..1e2418ba4 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -173,6 +173,8 @@ class xorwow_generator_template : public generator_impl_base using poisson_distribution_manager_t = poisson_distribution_manager; using poisson_distribution_t = typename poisson_distribution_manager_t::distribution_t; + using poisson_approx_distribution_t = + typename poisson_distribution_manager_t::approx_distribution_t; xorwow_generator_template(unsigned long long seed = 0, unsigned long long offset = 0, @@ -428,12 +430,16 @@ class xorwow_generator_template : public generator_impl_base rocrand_status generate_poisson(unsigned int * data, size_t data_size, double lambda) { - auto dis = m_poisson.get_distribution(lambda); - if(auto* error_status = std::get_if(&dis)) + auto result = m_poisson.get_distribution(lambda); + if(auto* dis = std::get_if(&result)) { - return *error_status; + return generate(data, data_size, *dis); } 
- return generate(data, data_size, std::get(dis)); + if(auto* dis = std::get_if(&result)) + { + return generate(data, data_size, *dis); + } + return std::get(result); } private: diff --git a/test/internal/test_poisson_distribution.cpp b/test/internal/test_poisson_distribution.cpp index 7d4109af4..32170009d 100644 --- a/test/internal/test_poisson_distribution.cpp +++ b/test/internal/test_poisson_distribution.cpp @@ -50,7 +50,7 @@ TEST_P(poisson_distribution_tests, mean_var) ROCRAND_CHECK( distribution_factory_t::create(poisson_probabilities, size, offset, discrete_dist)); - poisson_distribution dis(discrete_dist, lambda); + poisson_distribution dis(discrete_dist); const size_t samples_count = static_cast(std::max(2.0, sqrt(lambda))) * 100000; std::vector values(samples_count); @@ -87,7 +87,7 @@ TEST_P(poisson_distribution_tests, histogram_compare) ROCRAND_CHECK( distribution_factory_t::create(poisson_probabilities, size, offset, discrete_dist)); - poisson_distribution dis(discrete_dist, lambda); + poisson_distribution dis(discrete_dist); const size_t samples_count = static_cast(std::max(2.0, sqrt(lambda))) * 100000; const size_t bin_size = static_cast(std::max(2.0, sqrt(lambda))); From df7893aedfc0c04fcce057199a882d5180cda18f Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Thu, 20 Jun 2024 13:58:48 +0500 Subject: [PATCH 44/54] clang-format: Break after attributes --- .clang-format | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index b1d464c96..79dea0784 100644 --- a/.clang-format +++ b/.clang-format @@ -45,7 +45,6 @@ AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes -AttributeMacros: ['QUALIFIERS', 'FQUALIFIERS'] BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both @@ -88,7 +87,7 @@ EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always 
ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +ForEachMacros: [] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false @@ -139,4 +138,17 @@ SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false + +AttributeMacros: ['__host__', '__device__', '__global__', '__forceinline__', '__shared__', '__launch_bounds__'] +# Trick clang into thinking that our C-style attributes are C++-style attributes +Macros: + - __host__=[[host]] + - __device__=[[device]] + - __global__=[[global]] + - __forceinline__=[[forceinline]] + - __shared__=[[shared]] + - __launch_bounds__(x)=[[launch_bounds(x)]] + - __attribute__(x)=[[attribute(x)]] +BreakAfterAttributes: Always + --- From c3b4cf9044337f2a74e1cef3c847e6ac4766fa0c Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Fri, 21 Jun 2024 11:32:49 +0500 Subject: [PATCH 45/54] Add missing __forceinline__ to improve performance on ROCm 6.2 hipcc from ROCm 6.2 does not add `-mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false` by default. 
--- library/src/rng/distribution/log_normal.hpp | 137 +++++++++----------- library/src/rng/distribution/normal.hpp | 137 +++++++++----------- library/src/rng/distribution/uniform.hpp | 130 ++++++++++--------- library/src/rng/mt19937.hpp | 82 ++++++------ library/src/rng/mtgp32.hpp | 48 ++++--- library/src/rng/sobol.hpp | 7 +- 6 files changed, 263 insertions(+), 278 deletions(-) diff --git a/library/src/rng/distribution/log_normal.hpp b/library/src/rng/distribution/log_normal.hpp index 2f9dc7113..e0a5861ba 100644 --- a/library/src/rng/distribution/log_normal.hpp +++ b/library/src/rng/distribution/log_normal.hpp @@ -51,20 +51,18 @@ struct log_normal_distribution; template<> struct log_normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const float mean; const float stddev; - __host__ __device__ - log_normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) {} + log_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], float (&output)[2]) const { - float2 v = rocrand_device::detail::normal_distribution2(input[0], input[1]); + float2 v = rocrand_device::detail::normal_distribution2(input[0], input[1]); output[0] = expf(mean + v.x * stddev); output[1] = expf(mean + v.y * stddev); } @@ -79,11 +77,10 @@ struct log_normal_distribution const double mean; const double stddev; - __host__ __device__ log_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) - {} + log_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned int (&input)[1], double (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned int (&input)[1], double (&output)[1]) const { double v = 
rocrand_device::detail::normal_distribution_double(input[0]); output[0] = exp(mean + v * stddev); @@ -93,22 +90,19 @@ struct log_normal_distribution template<> struct log_normal_distribution { - static constexpr unsigned int input_width = 4; + static constexpr unsigned int input_width = 4; static constexpr unsigned int output_width = 2; const double mean; const double stddev; - __host__ __device__ - log_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) {} + log_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[4], double (&output)[2]) const { double2 v = rocrand_device::detail::normal_distribution_double2( - make_uint4(input[0], input[1], input[2], input[3]) - ); + make_uint4(input[0], input[1], input[2], input[3])); output[0] = exp(mean + v.x * stddev); output[1] = exp(mean + v.y * stddev); } @@ -117,26 +111,25 @@ struct log_normal_distribution template<> struct log_normal_distribution<__half, unsigned int, 4> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; const __half2 mean; const __half2 stddev; - __host__ __device__ - log_normal_distribution(__half mean, __half stddev) - : mean(mean, mean), stddev(stddev, stddev) {} + log_normal_distribution(__half mean, __half stddev) : mean(mean, mean), stddev(stddev, stddev) + {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], __half (&output)[2]) const { __half2 v = rocrand_device::detail::normal_distribution_half2(input[0]); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) - *reinterpret_cast<__half2 *>(output) = h2exp(__hfma2(v, stddev, mean)); - #else +#if defined(ROCRAND_HALF_MATH_SUPPORTED) + *reinterpret_cast<__half2*>(output) = h2exp(__hfma2(v, stddev, mean)); +#else output[0] = 
__float2half(expf(__low2float(mean) + (__low2float(stddev) * __low2float(v)))); output[1] = __float2half(expf(__low2float(mean) + (__low2float(stddev) * __high2float(v)))); - #endif +#endif } }; @@ -151,12 +144,10 @@ struct log_normal_distribution const float mean; const float stddev; - __host__ __device__ log_normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) - {} + log_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[1], - float (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], float (&output)[2]) const { float2 v = rocrand_device::detail::normal_distribution2(input[0]); output[0] = expf(mean + v.x * stddev); @@ -173,12 +164,10 @@ struct log_normal_distribution const double mean; const double stddev; - __host__ __device__ log_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) - {} + log_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[2], - double (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[2], double (&output)[2]) const { double2 v = rocrand_device::detail::normal_distribution_double2( make_ulonglong2(input[0], input[1])); @@ -196,12 +185,11 @@ struct log_normal_distribution<__half, unsigned long long, 4> const __half2 mean; const __half2 stddev; - __host__ __device__ log_normal_distribution(__half mean, __half stddev) - : mean(mean, mean), stddev(stddev, stddev) + log_normal_distribution(__half mean, __half stddev) : mean(mean, mean), stddev(stddev, stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[1], - __half (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], __half (&output)[2]) 
const { __half2 v = rocrand_device::detail::normal_distribution_half2(input[0]); #if defined(ROCRAND_HALF_MATH_SUPPORTED) @@ -221,17 +209,15 @@ struct mrg_engine_log_normal_distribution; template struct mrg_engine_log_normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const float mean; const float stddev; - __host__ __device__ mrg_engine_log_normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) - {} + mrg_engine_log_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], float (&output)[2]) const { float2 v = rocrand_device::detail::mrg_normal_distribution2(input[0], input[1]); @@ -243,17 +229,15 @@ struct mrg_engine_log_normal_distribution template struct mrg_engine_log_normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const double mean; const double stddev; - __host__ __device__ mrg_engine_log_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) - {} + mrg_engine_log_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], double (&output)[2]) const { double2 v = rocrand_device::detail::mrg_normal_distribution_double2(input[0], @@ -266,26 +250,27 @@ struct mrg_engine_log_normal_distribution template struct mrg_engine_log_normal_distribution<__half, state_type> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; const __half2 mean; const __half2 stddev; - __host__ __device__ mrg_engine_log_normal_distribution(__half mean, __half 
stddev) // cppcheck-suppress uninitMemberVar - : mean(mean, mean), stddev(stddev, stddev) + mrg_engine_log_normal_distribution(__half mean, + __half stddev) // cppcheck-suppress uninitMemberVar + : mean(mean, mean), stddev(stddev, stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], __half (&output)[2]) const { __half2 v = rocrand_device::detail::mrg_normal_distribution_half2(input[0]); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) - *reinterpret_cast<__half2 *>(output) = h2exp(__hfma2(v, stddev, mean)); - #else +#if defined(ROCRAND_HALF_MATH_SUPPORTED) + *reinterpret_cast<__half2*>(output) = h2exp(__hfma2(v, stddev, mean)); +#else output[0] = __float2half(expf(__low2float(mean) + (__low2float(stddev) * __low2float(v)))); output[1] = __float2half(expf(__low2float(mean) + (__low2float(stddev) * __high2float(v)))); - #endif +#endif } }; @@ -298,7 +283,7 @@ template<> struct mrg_log_normal_distribution : mrg_engine_log_normal_distribution { - __host__ __device__ mrg_log_normal_distribution(float mean, float stddev) + mrg_log_normal_distribution(float mean, float stddev) : mrg_engine_log_normal_distribution(mean, stddev) {} }; @@ -307,7 +292,7 @@ template<> struct mrg_log_normal_distribution : mrg_engine_log_normal_distribution { - __host__ __device__ mrg_log_normal_distribution(double mean, double stddev) + mrg_log_normal_distribution(double mean, double stddev) : mrg_engine_log_normal_distribution(mean, stddev) {} }; @@ -316,7 +301,7 @@ template<> struct mrg_log_normal_distribution<__half> : mrg_engine_log_normal_distribution<__half, rocrand_state_mrg32k3a> { - __host__ __device__ mrg_log_normal_distribution(__half mean, __half stddev) + mrg_log_normal_distribution(__half mean, __half stddev) : mrg_engine_log_normal_distribution(mean, stddev) {} }; @@ -332,12 +317,12 @@ struct sobol_log_normal_distribution const float mean; const float stddev; - __host__ __device__ - sobol_log_normal_distribution(float 
mean, float stddev) - : mean(mean), stddev(stddev) {} + sobol_log_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ float operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + float + operator()(const DirectionVectorType x) const { float v = rocrand_device::detail::normal_distribution(x); return expf(mean + (stddev * v)); @@ -350,12 +335,12 @@ struct sobol_log_normal_distribution const double mean; const double stddev; - __host__ __device__ - sobol_log_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) {} + sobol_log_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ double operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + double + operator()(const DirectionVectorType x) const { double v = rocrand_device::detail::normal_distribution_double(x); return exp(mean + (stddev * v)); @@ -368,19 +353,19 @@ struct sobol_log_normal_distribution<__half> const __half mean; const __half stddev; - __host__ __device__ - sobol_log_normal_distribution(__half mean, __half stddev) - : mean(mean), stddev(stddev) {} + sobol_log_normal_distribution(__half mean, __half stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ __half operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + __half + operator()(const DirectionVectorType x) const { float v = rocrand_device::detail::normal_distribution(x); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) +#if defined(ROCRAND_HALF_MATH_SUPPORTED) return hexp(__hfma(__float2half(v), stddev, mean)); - #else +#else return __float2half(expf(__half2float(mean) + (__half2float(stddev) * v))); - #endif +#endif } }; diff --git a/library/src/rng/distribution/normal.hpp b/library/src/rng/distribution/normal.hpp index 6465ef433..59bb8cfbf 100644 --- a/library/src/rng/distribution/normal.hpp +++ 
b/library/src/rng/distribution/normal.hpp @@ -49,20 +49,18 @@ struct normal_distribution; template<> struct normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const float mean; const float stddev; - __host__ __device__ - normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) {} + normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], float (&output)[2]) const { - float2 v = rocrand_device::detail::normal_distribution2(input[0], input[1]); + float2 v = rocrand_device::detail::normal_distribution2(input[0], input[1]); output[0] = mean + v.x * stddev; output[1] = mean + v.y * stddev; } @@ -77,10 +75,10 @@ struct normal_distribution const double mean; const double stddev; - __host__ __device__ normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) - {} + normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned int (&input)[1], double (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned int (&input)[1], double (&output)[1]) const { double v = rocrand_device::detail::normal_distribution_double(input[0]); output[0] = mean + v * stddev; @@ -90,22 +88,19 @@ struct normal_distribution template<> struct normal_distribution { - static constexpr unsigned int input_width = 4; + static constexpr unsigned int input_width = 4; static constexpr unsigned int output_width = 2; const double mean; const double stddev; - __host__ __device__ - normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) {} + normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const 
unsigned int (&input)[4], double (&output)[2]) const { double2 v = rocrand_device::detail::normal_distribution_double2( - make_uint4(input[0], input[1], input[2], input[3]) - ); + make_uint4(input[0], input[1], input[2], input[3])); output[0] = mean + v.x * stddev; output[1] = mean + v.y * stddev; } @@ -114,26 +109,24 @@ struct normal_distribution template<> struct normal_distribution<__half, unsigned int, 4> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; const __half2 mean; const __half2 stddev; - __host__ __device__ - normal_distribution(__half mean, __half stddev) - : mean(mean, mean), stddev(stddev, stddev) {} + normal_distribution(__half mean, __half stddev) : mean(mean, mean), stddev(stddev, stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], __half (&output)[2]) const { __half2 v = rocrand_device::detail::normal_distribution_half2(input[0]); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) - *reinterpret_cast<__half2 *>(output) = __hfma2(v, stddev, mean); - #else +#if defined(ROCRAND_HALF_MATH_SUPPORTED) + *reinterpret_cast<__half2*>(output) = __hfma2(v, stddev, mean); +#else output[0] = __float2half(__low2float(mean) + (__low2float(stddev) * __low2float(v))); output[1] = __float2half(__low2float(mean) + (__low2float(stddev) * __high2float(v))); - #endif +#endif } }; @@ -148,11 +141,10 @@ struct normal_distribution const float mean; const float stddev; - __host__ __device__ normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) - {} + normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[1], - float (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], float (&output)[2]) const { float2 v = 
rocrand_device::detail::normal_distribution2(input[0]); output[0] = mean + v.x * stddev; @@ -169,11 +161,10 @@ struct normal_distribution const double mean; const double stddev; - __host__ __device__ normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) - {} + normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[2], - double (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[2], double (&output)[2]) const { double2 v = rocrand_device::detail::normal_distribution_double2( make_ulonglong2(input[0], input[1])); @@ -191,12 +182,10 @@ struct normal_distribution<__half, unsigned long long, 4> const __half2 mean; const __half2 stddev; - __host__ __device__ normal_distribution(__half mean, __half stddev) - : mean(mean, mean), stddev(stddev, stddev) - {} + normal_distribution(__half mean, __half stddev) : mean(mean, mean), stddev(stddev, stddev) {} - __host__ __device__ void operator()(const unsigned long long (&input)[1], - __half (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], __half (&output)[2]) const { __half2 v = rocrand_device::detail::normal_distribution_half2(input[0]); #if defined(ROCRAND_HALF_MATH_SUPPORTED) @@ -216,17 +205,15 @@ struct mrg_engine_normal_distribution; template struct mrg_engine_normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const float mean; const float stddev; - __host__ __device__ mrg_engine_normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) - {} + mrg_engine_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], float 
(&output)[2]) const { float2 v = rocrand_device::detail::mrg_normal_distribution2(input[0], input[1]); @@ -238,17 +225,15 @@ struct mrg_engine_normal_distribution template struct mrg_engine_normal_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 2; const double mean; const double stddev; - __host__ __device__ mrg_engine_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) - {} + mrg_engine_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], double (&output)[2]) const { double2 v = rocrand_device::detail::mrg_normal_distribution_double2(input[0], @@ -261,26 +246,26 @@ struct mrg_engine_normal_distribution template struct mrg_engine_normal_distribution<__half, state_type> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; const __half2 mean; const __half2 stddev; - __host__ __device__ mrg_engine_normal_distribution(__half mean, __half stddev) // cppcheck-suppress uninitMemberVar - : mean(mean, mean), stddev(stddev, stddev) + mrg_engine_normal_distribution(__half mean, __half stddev) // cppcheck-suppress uninitMemberVar + : mean(mean, mean), stddev(stddev, stddev) {} - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], __half (&output)[2]) const { __half2 v = rocrand_device::detail::mrg_normal_distribution_half2(input[0]); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) - *reinterpret_cast<__half2 *>(output) = __hfma2(v, stddev, mean); - #else +#if defined(ROCRAND_HALF_MATH_SUPPORTED) + *reinterpret_cast<__half2*>(output) = __hfma2(v, stddev, mean); +#else output[0] = __float2half(__low2float(mean) + (__low2float(stddev) * __low2float(v))); output[1] = 
__float2half(__low2float(mean) + (__low2float(stddev) * __high2float(v))); - #endif +#endif } }; @@ -293,8 +278,7 @@ template<> struct mrg_normal_distribution : mrg_engine_normal_distribution { - __host__ __device__ mrg_normal_distribution(float mean, float stddev) - : mrg_engine_normal_distribution(mean, stddev) + mrg_normal_distribution(float mean, float stddev) : mrg_engine_normal_distribution(mean, stddev) {} }; @@ -302,7 +286,7 @@ template<> struct mrg_normal_distribution : mrg_engine_normal_distribution { - __host__ __device__ mrg_normal_distribution(double mean, double stddev) + mrg_normal_distribution(double mean, double stddev) : mrg_engine_normal_distribution(mean, stddev) {} }; @@ -311,7 +295,7 @@ template<> struct mrg_normal_distribution<__half> : mrg_engine_normal_distribution<__half, rocrand_state_mrg32k3a> { - __host__ __device__ mrg_normal_distribution(__half mean, __half stddev) + mrg_normal_distribution(__half mean, __half stddev) : mrg_engine_normal_distribution(mean, stddev) {} }; @@ -327,13 +311,12 @@ struct sobol_normal_distribution const float mean; const float stddev; - __host__ __device__ - sobol_normal_distribution(float mean, float stddev) - : mean(mean), stddev(stddev) {} + sobol_normal_distribution(float mean, float stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ - float operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + float + operator()(const DirectionVectorType x) const { float v = rocrand_device::detail::normal_distribution(x); return mean + v * stddev; @@ -346,13 +329,12 @@ struct sobol_normal_distribution const double mean; const double stddev; - __host__ __device__ - sobol_normal_distribution(double mean, double stddev) - : mean(mean), stddev(stddev) {} + sobol_normal_distribution(double mean, double stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ - double operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + double + 
operator()(const DirectionVectorType x) const { double v = rocrand_device::detail::normal_distribution_double(x); return mean + v * stddev; @@ -365,20 +347,19 @@ struct sobol_normal_distribution<__half> const __half mean; const __half stddev; - __host__ __device__ - sobol_normal_distribution(__half mean, __half stddev) - : mean(mean), stddev(stddev) {} + sobol_normal_distribution(__half mean, __half stddev) : mean(mean), stddev(stddev) {} template - __host__ __device__ - __half operator()(const DirectionVectorType x) const + __forceinline__ __host__ __device__ + __half + operator()(const DirectionVectorType x) const { float v = rocrand_device::detail::normal_distribution(x); - #if defined(ROCRAND_HALF_MATH_SUPPORTED) +#if defined(ROCRAND_HALF_MATH_SUPPORTED) return __hfma(__float2half(v), stddev, mean); - #else +#else return __float2half(__half2float(mean) + v * __half2float(stddev)); - #endif +#endif } }; diff --git a/library/src/rng/distribution/uniform.hpp b/library/src/rng/distribution/uniform.hpp index ae6850d40..61771873a 100644 --- a/library/src/rng/distribution/uniform.hpp +++ b/library/src/rng/distribution/uniform.hpp @@ -40,52 +40,52 @@ struct uniform_distribution; template<> struct uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned int (&output)[1]) const { unsigned int v = input[0]; - output[0] = v; + output[0] = v; } }; template<> struct uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 4; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned char (&output)[4]) const { - unsigned int v = input[0]; - *reinterpret_cast(output) = v; + unsigned int v 
= input[0]; + *reinterpret_cast(output) = v; } }; template<> struct uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned short (&output)[2]) const { - unsigned int v = input[0]; - *reinterpret_cast(output) = v; + unsigned int v = input[0]; + *reinterpret_cast(output) = v; } }; template<> struct uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], float (&output)[1]) const { output[0] = rocrand_device::detail::uniform_distribution(input[0]); @@ -95,10 +95,10 @@ struct uniform_distribution template<> struct uniform_distribution { - static constexpr unsigned int input_width = 2; + static constexpr unsigned int input_width = 2; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[2], double (&output)[1]) const { output[0] = rocrand_device::detail::uniform_distribution_double(input[0], input[1]); @@ -108,14 +108,14 @@ struct uniform_distribution template<> struct uniform_distribution<__half, unsigned int> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], __half (&output)[2]) const { unsigned int v = input[0]; - output[0] = rocrand_device::detail::uniform_distribution_half(static_cast(v)); + output[0] = rocrand_device::detail::uniform_distribution_half(static_cast(v)); output[1] = 
rocrand_device::detail::uniform_distribution_half(static_cast(v >> 16)); } }; @@ -128,8 +128,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - unsigned long long (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], unsigned long long (&output)[1]) const { unsigned long long v = input[0]; output[0] = v; @@ -142,8 +142,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 8; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - unsigned char (&output)[8]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], unsigned char (&output)[8]) const { unsigned long long v = input[0]; *reinterpret_cast(output) = v; @@ -156,8 +156,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 4; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - unsigned short (&output)[4]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], unsigned short (&output)[4]) const { unsigned long long v = input[0]; *reinterpret_cast(output) = v; @@ -170,8 +170,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - unsigned int (&output)[2]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], unsigned int (&output)[2]) const { unsigned long long v = input[0]; *reinterpret_cast(output) = v; @@ -184,8 +184,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int 
output_width = 1; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - float (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], float (&output)[1]) const { output[0] = rocrand_device::detail::uniform_distribution(input[0]); } @@ -197,8 +197,8 @@ struct uniform_distribution static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - double (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], double (&output)[1]) const { output[0] = rocrand_device::detail::uniform_distribution_double(input[0]); } @@ -210,8 +210,8 @@ struct uniform_distribution<__half, unsigned long long> static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 4; - __host__ __device__ void operator()(const unsigned long long (&input)[1], - __half (&output)[4]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned long long (&input)[1], __half (&output)[4]) const { unsigned long long v = input[0]; output[0] = rocrand_device::detail::uniform_distribution_half(static_cast(v)); @@ -229,10 +229,10 @@ struct mrg_engine_uniform_distribution; template struct mrg_engine_uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned int (&output)[1]) const { unsigned int v @@ -244,40 +244,40 @@ struct mrg_engine_uniform_distribution template struct mrg_engine_uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 4; - __host__ __device__ + __forceinline__ __host__ 
__device__ void operator()(const unsigned int (&input)[1], unsigned char (&output)[4]) const { unsigned int v = rocrand_device::detail::mrg_uniform_distribution_uint(input[0]); - *reinterpret_cast(output) = v; + *reinterpret_cast(output) = v; } }; template struct mrg_engine_uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], unsigned short (&output)[2]) const { unsigned int v = rocrand_device::detail::mrg_uniform_distribution_uint(input[0]); - *reinterpret_cast(output) = v; + *reinterpret_cast(output) = v; } }; template struct mrg_engine_uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], float (&output)[1]) const { output[0] = rocrand_device::detail::mrg_uniform_distribution(input[0]); @@ -287,10 +287,10 @@ struct mrg_engine_uniform_distribution template struct mrg_engine_uniform_distribution { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 1; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], double (&output)[1]) const { output[0] = rocrand_device::detail::mrg_uniform_distribution_double(input[0]); @@ -300,10 +300,10 @@ struct mrg_engine_uniform_distribution template struct mrg_engine_uniform_distribution<__half, state_type> { - static constexpr unsigned int input_width = 1; + static constexpr unsigned int input_width = 1; static constexpr unsigned int output_width = 2; - __host__ __device__ + __forceinline__ __host__ __device__ void operator()(const unsigned int 
(&input)[1], __half (&output)[2]) const { unsigned int v @@ -356,8 +356,9 @@ struct sobol_uniform_distribution; template<> struct sobol_uniform_distribution { - __host__ __device__ - unsigned long long int operator()(const unsigned long long int v) const + __forceinline__ __host__ __device__ + unsigned long long int + operator()(const unsigned long long int v) const { return v; } @@ -367,8 +368,9 @@ template<> struct sobol_uniform_distribution { template - __host__ __device__ - unsigned int operator()(const DirectionVectorType v) const + __forceinline__ __host__ __device__ + unsigned int + operator()(const DirectionVectorType v) const { constexpr int bit_shift = ((sizeof(DirectionVectorType) - sizeof(unsigned int)) * 8); return v >> bit_shift; @@ -379,8 +381,9 @@ template<> struct sobol_uniform_distribution { template - __host__ __device__ - unsigned char operator()(const DirectionVectorType v) const + __forceinline__ __host__ __device__ + unsigned char + operator()(const DirectionVectorType v) const { constexpr int bit_shift = ((sizeof(DirectionVectorType) - sizeof(unsigned char)) * 8); return static_cast(v >> bit_shift); @@ -391,8 +394,9 @@ template<> struct sobol_uniform_distribution { template - __host__ __device__ - unsigned short operator()(const DirectionVectorType v) const + __forceinline__ __host__ __device__ + unsigned short + operator()(const DirectionVectorType v) const { constexpr int bit_shift = ((sizeof(DirectionVectorType) - sizeof(unsigned short)) * 8); return static_cast(v >> bit_shift); @@ -403,8 +407,9 @@ template<> struct sobol_uniform_distribution { template - __host__ __device__ - float operator()(const DirectionVectorType v) const + __forceinline__ __host__ __device__ + float + operator()(const DirectionVectorType v) const { return rocrand_device::detail::uniform_distribution(v); } @@ -414,8 +419,9 @@ template<> struct sobol_uniform_distribution { template - __host__ __device__ - double operator()(const DirectionVectorType v) const + 
__forceinline__ __host__ __device__ + double + operator()(const DirectionVectorType v) const { return rocrand_device::detail::uniform_distribution_double(v); } @@ -425,11 +431,13 @@ template<> struct sobol_uniform_distribution<__half> { template - __host__ __device__ - __half operator()(const DirectionVectorType v) const + __forceinline__ __host__ __device__ + __half + operator()(const DirectionVectorType v) const { constexpr int bit_shift = ((sizeof(DirectionVectorType) - sizeof(unsigned short)) * 8); - return rocrand_device::detail::uniform_distribution_half(static_cast(v >> bit_shift)); + return rocrand_device::detail::uniform_distribution_half( + static_cast(v >> bit_shift)); } }; diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index a65c80d03..6ded56c24 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -81,13 +81,14 @@ __forceinline__ __device__ __host__ unsigned int wrap_n(unsigned int i) // Config is not actually used for kernel launch here, but is needed to check the number of generators // As this kernel is not dependent on any type just use void for the config, as mt19937 is not tuned for types independently, so all configs are the same for different types. 
template -__host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, - dim3 thread_idx, - dim3 /*grid_dim*/, - [[maybe_unused]] dim3 block_dim, - unsigned int* __restrict__ engines, - unsigned long long seed, - const unsigned int* __restrict__ jump) +__forceinline__ __host__ __device__ +void jump_ahead_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + [[maybe_unused]] dim3 block_dim, + unsigned int* __restrict__ engines, + unsigned long long seed, + const unsigned int* __restrict__ jump) { #if defined(__HIP_DEVICE_COMPILE__) static constexpr bool isDevice = true; @@ -274,12 +275,13 @@ __host__ __device__ inline void jump_ahead_mt19937(dim3 block_idx, // This kernel is not explicitly tuned, but uses the same configs as the generate-kernels. // As this kernel is not dependent on any type just use void for the config, as mt19937 is not tuned for types independently, so all configs are the same for different types. template -__host__ __device__ inline void init_engines_mt19937(dim3 block_idx, - dim3 thread_idx, - dim3 /*grid_dim*/, - dim3 /*block_dim*/, - unsigned int* __restrict__ octo_engines, - const unsigned int* __restrict__ engines) +__forceinline__ __host__ __device__ +void init_engines_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 /*block_dim*/, + unsigned int* __restrict__ octo_engines, + const unsigned int* __restrict__ engines) { constexpr generator_config config = ConfigProvider::template device_config(IsDynamic); constexpr unsigned int block_size = config.threads; @@ -301,19 +303,20 @@ __host__ __device__ inline void init_engines_mt19937(dim3 block_idx, } template -__host__ __device__ inline void generate_short_mt19937(dim3 block_idx, - dim3 thread_idx, - dim3 /*grid_dim*/, - dim3 /*block_dim*/, - unsigned int* __restrict__ engines, - const unsigned int start_input, - T* __restrict__ data, - const size_t size, - VecT* __restrict__ vec_data, - const size_t vec_size, - const unsigned int head_size, - const 
unsigned int tail_size, - Distribution distribution) +__forceinline__ __host__ __device__ +void generate_short_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 /*block_dim*/, + unsigned int* __restrict__ engines, + const unsigned int start_input, + T* __restrict__ data, + const size_t size, + VecT* __restrict__ vec_data, + const size_t vec_size, + const unsigned int head_size, + const unsigned int tail_size, + Distribution distribution) { #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) @@ -428,19 +431,20 @@ __host__ __device__ inline void generate_short_mt19937(dim3 block_idx, } template -__host__ __device__ inline void generate_long_mt19937(dim3 block_idx, - dim3 thread_idx, - dim3 /*grid_dim*/, - dim3 block_dim, - unsigned int* __restrict__ engines, - const unsigned int start_input, - T* __restrict__ data, - const size_t size, - VecT* __restrict__ vec_data, - const size_t vec_size, - const unsigned int head_size, - const unsigned int tail_size, - Distribution distribution) +__forceinline__ __host__ __device__ +void generate_long_mt19937(dim3 block_idx, + dim3 thread_idx, + dim3 /*grid_dim*/, + dim3 block_dim, + unsigned int* __restrict__ engines, + const unsigned int start_input, + T* __restrict__ data, + const size_t size, + VecT* __restrict__ vec_data, + const size_t vec_size, + const unsigned int head_size, + const unsigned int tail_size, + Distribution distribution) { #if !defined(__HIP_DEVICE_COMPILE__) if(thread_idx.x % 8 != 0) diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index f5460bd30..feeadec94 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -80,7 +80,8 @@ struct mtgp32_device_engine : ::rocrand_device::mtgp32_engine // suppress warning about no initialization for __shared__ variables __host__ __device__ mtgp32_device_engine(){}; - __host__ __device__ unsigned int next() + __forceinline__ __host__ __device__ + unsigned int next() { #ifdef __HIP_DEVICE_COMPILE__ // 
all threads in block produce one value and advance the state by that many values @@ -114,10 +115,11 @@ __host__ void generate(unsigned int (&input)[BlockSize][Distribution::input_widt } template -__device__ void generate(unsigned int (&input)[Distribution::input_width], - T (&output)[Distribution::output_width], - Distribution& distribution, - mtgp32_device_engine& engine) +__forceinline__ __device__ +void generate(unsigned int (&input)[Distribution::input_width], + T (&output)[Distribution::output_width], + Distribution& distribution, + mtgp32_device_engine& engine) { for(unsigned int i = 0; i < Distribution::input_width; i++) { @@ -136,7 +138,8 @@ __host__ void save_vec_n(vec_type* vec_data, T (&output)[BlockSize][output_width } template -__device__ void save_vec_n(vec_type* vec_data, T (&output)[output_width], size_t index) +__forceinline__ __device__ +void save_vec_n(vec_type* vec_data, T (&output)[output_width], size_t index) { vec_data[index] = *reinterpret_cast(output); } @@ -155,7 +158,8 @@ __host__ void } template -__device__ void save_n(vec_type* vec_data, T (&output)[output_width], size_t index, size_t vec_n) +__forceinline__ __device__ +void save_n(vec_type* vec_data, T (&output)[output_width], size_t index, size_t vec_n) { if(index < vec_n) { @@ -164,13 +168,14 @@ __device__ void save_n(vec_type* vec_data, T (&output)[output_width], size_t ind } template -__host__ __device__ void save_head_tail_impl(T (&output)[output_width], - size_t index, - T* data, - size_t n, - size_t head_size, - size_t tail_size, - size_t vec_n_up) +__forceinline__ __host__ __device__ +void save_head_tail_impl(T (&output)[output_width], + size_t index, + T* data, + size_t n, + size_t head_size, + size_t tail_size, + size_t vec_n_up) { if(index == vec_n_up) { @@ -211,13 +216,14 @@ __host__ void save_head_tail(T (&output)[BlockSize][output_width], } template -__device__ void save_head_tail(T (&output)[output_width], - size_t index, - T* data, - size_t n, - size_t head_size, - 
size_t tail_size, - size_t vec_n_up) +__forceinline__ __device__ +void save_head_tail(T (&output)[output_width], + size_t index, + T* data, + size_t n, + size_t head_size, + size_t tail_size, + size_t vec_n_up) { save_head_tail_impl(output, index, data, n, head_size, tail_size, vec_n_up); } diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp index 00d3271d7..4d849bca4 100644 --- a/library/src/rng/sobol.hpp +++ b/library/src/rng/sobol.hpp @@ -53,9 +53,10 @@ namespace rocrand_impl::host { template -__host__ __device__ Engine create_engine(const Constant* vectors, - [[maybe_unused]] Constant scramble_constant, - const unsigned int offset) +__forceinline__ __host__ __device__ +Engine create_engine(const Constant* vectors, + [[maybe_unused]] Constant scramble_constant, + const unsigned int offset) { if constexpr(Scrambled) { From 07c873ce6cea50ab17850903edabc7187c13e8bf Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 24 Jun 2024 16:40:26 +0500 Subject: [PATCH 46/54] Remove meaningless code in xorwow introduced during rebase/merge --- library/src/rng/xorwow.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 1e2418ba4..659df06a0 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -129,9 +129,6 @@ __host__ __device__ __forceinline__ void generate_xorwow(dim3 block_idx, } distribution(input, output); - vec_data[index] = *reinterpret_cast(output); - // Next position - index += num_engines; for(unsigned int o = 0; o < output_width; o++) { if(o < head_size) From 67f4fdbf7192aab7cd9eed5dfb4d8046e6f77151 Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Mon, 1 Jul 2024 15:11:58 +0000 Subject: [PATCH 47/54] style: update formatting --- library/src/rng/distribution/discrete.hpp | 4 +-- library/src/rng/distribution/poisson.hpp | 23 ++++++++------ library/src/rng/mt19937.hpp | 3 +- library/src/rng/mt19937_octo_engine.hpp | 37 ++++++++++++----------- 
library/src/rng/utils/cpp_utils.hpp | 6 ++-- 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/library/src/rng/distribution/discrete.hpp b/library/src/rng/distribution/discrete.hpp index 9e3bf1779..ab02b28d4 100644 --- a/library/src/rng/distribution/discrete.hpp +++ b/library/src/rng/distribution/discrete.hpp @@ -78,8 +78,8 @@ class discrete_distribution_base } template - __forceinline__ __host__ __device__ void operator()(const T (&input)[1], - unsigned int output[1]) const + __forceinline__ __host__ __device__ + void operator()(const T (&input)[1], unsigned int output[1]) const { output[0] = (*this)(input[0]); } diff --git a/library/src/rng/distribution/poisson.hpp b/library/src/rng/distribution/poisson.hpp index 8a7773f57..7976e29ee 100644 --- a/library/src/rng/distribution/poisson.hpp +++ b/library/src/rng/distribution/poisson.hpp @@ -60,14 +60,16 @@ class poisson_distribution : private discrete_distribution_base {} template - __forceinline__ __host__ __device__ unsigned int operator()(T x) const + __forceinline__ __host__ __device__ + unsigned int + operator()(T x) const { return base_t::operator()(x); } template - __forceinline__ __host__ __device__ void operator()(const T (&input)[1], - unsigned int (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const T (&input)[1], unsigned int (&output)[1]) const { output[0] = (*this)(input[0]); } @@ -85,15 +87,17 @@ class poisson_distribution_huge {} template - __forceinline__ __host__ __device__ unsigned int operator()(T x) const + __forceinline__ __host__ __device__ + unsigned int + operator()(T x) const { const double normal_d = rocrand_device::detail::normal_distribution_double(x); return static_cast(round(m_sqrt_lambda * normal_d + m_lambda)); } template - __forceinline__ __host__ __device__ void operator()(const T (&input)[1], - unsigned int (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const T (&input)[1], unsigned int (&output)[1]) const { 
output[0] = (*this)(input[0]); } @@ -103,7 +107,8 @@ class poisson_distribution_huge double m_sqrt_lambda; }; -[[nodiscard]] inline std::vector +[[nodiscard]] +inline std::vector calculate_poisson_probabilities(const double lambda, unsigned int& size, unsigned int& offset) { const size_t capacity = 2 * static_cast(16.0 * (2.0 + std::sqrt(lambda))); @@ -421,8 +426,8 @@ struct mrg_engine_poisson_distribution explicit mrg_engine_poisson_distribution(distribution_type dis) : dis(dis) {} - __forceinline__ __host__ __device__ void operator()(const unsigned int (&input)[1], - unsigned int (&output)[1]) const + __forceinline__ __host__ __device__ + void operator()(const unsigned int (&input)[1], unsigned int (&output)[1]) const { // Alias method requires x in [0, 1), uint must be in [0, UINT_MAX], // but MRG-based engine's "raw" output is in [1, MRG_M1], diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 6ded56c24..bf3d689f5 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -73,7 +73,8 @@ namespace rocrand_impl::host { /// Computes i % n, i must be in range [0, 2 * n) -__forceinline__ __device__ __host__ unsigned int wrap_n(unsigned int i) +__forceinline__ __device__ __host__ +unsigned int wrap_n(unsigned int i) { return i - (i < mt19937_constants::n ? 0 : mt19937_constants::n); } diff --git a/library/src/rng/mt19937_octo_engine.hpp b/library/src/rng/mt19937_octo_engine.hpp index c9a591cab..29a318e3b 100644 --- a/library/src/rng/mt19937_octo_engine.hpp +++ b/library/src/rng/mt19937_octo_engine.hpp @@ -136,8 +136,8 @@ struct mt19937_octo_engine static constexpr inline unsigned int i568 = 1 + items_per_thread * 10; /// Initialize the octo engine from the engine it shares with seven other threads. 
- __forceinline__ __device__ __host__ void gather(const unsigned int engine[mt19937_constants::n], - dim3 thread_idx) + __forceinline__ __device__ __host__ + void gather(const unsigned int engine[mt19937_constants::n], dim3 thread_idx) { constexpr unsigned int off_cnt = 11; /// Used to map the \p mt19937_octo_state.mt indices to \p mt19937_state.mt indices. @@ -183,8 +183,8 @@ struct mt19937_octo_engine return __shfl_up(val, 1, 8); } /// Calculates value of index \p i using values i, (i + 1) % n, and (i + m) % n. - static __forceinline__ __device__ __host__ unsigned int - comp(unsigned int mt_i, unsigned int mt_i_1, unsigned int mt_i_m) + static __forceinline__ __device__ __host__ + unsigned int comp(unsigned int mt_i, unsigned int mt_i_1, unsigned int mt_i_m) { const unsigned int y = (mt_i & mt19937_constants::upper_mask) | (mt_i_1 & mt19937_constants::lower_mask); @@ -220,10 +220,11 @@ struct mt19937_octo_engine m_state.mt[idx_i + j] = comp(m_state.mt[idx_i + j], last_dep, m_state.mt[idx_m + j]); } - __host__ static void comp_vector(unsigned int idx_i, - unsigned int idx_m, - unsigned int last_dep_tid_7, - mt19937_octo_engine (&thread_engines)[8]) + __host__ + static void comp_vector(unsigned int idx_i, + unsigned int idx_m, + unsigned int last_dep_tid_7, + mt19937_octo_engine (&thread_engines)[8]) { static constexpr unsigned int numberOfLanes = 8; // communicate the dependency for the last value @@ -549,13 +550,15 @@ struct mt19937_octo_engine } /// Return \p i state value without tempering - __forceinline__ __device__ __host__ unsigned int get(unsigned int i) const + __forceinline__ __device__ __host__ + unsigned int get(unsigned int i) const { return m_state.mt[i]; } /// Perform tempering on y - static __forceinline__ __device__ __host__ unsigned int temper(unsigned int y) + static __forceinline__ __device__ __host__ + unsigned int temper(unsigned int y) { constexpr unsigned int TEMPERING_MASK_B = 0x9D2C5680U; constexpr unsigned int TEMPERING_MASK_C = 
0xEFC60000U; @@ -575,21 +578,21 @@ struct mt19937_octo_engine template struct mt19937_octo_engine_accessor { - __forceinline__ - __device__ __host__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) + __forceinline__ __device__ __host__ explicit mt19937_octo_engine_accessor(unsigned int* _engines) : engines(_engines) {} /// Load one value \p i of the octo engine \p engine_id from global memory with coalesced /// access - __forceinline__ __device__ __host__ unsigned int load_value(unsigned int engine_id, - unsigned int i) const + __forceinline__ __device__ __host__ + unsigned int load_value(unsigned int engine_id, unsigned int i) const { return engines[i * stride + engine_id]; } /// Load the octo engine from global memory with coalesced access - __forceinline__ __device__ __host__ mt19937_octo_engine load(unsigned int engine_id) const + __forceinline__ __device__ __host__ + mt19937_octo_engine load(unsigned int engine_id) const { mt19937_octo_engine engine; #pragma unroll @@ -601,8 +604,8 @@ struct mt19937_octo_engine_accessor } /// Save the octo engine to global memory with coalesced access - __forceinline__ __device__ __host__ void save(unsigned int engine_id, - const mt19937_octo_engine& engine) const + __forceinline__ __device__ __host__ + void save(unsigned int engine_id, const mt19937_octo_engine& engine) const { #pragma unroll for(unsigned int i = 0; i < mt19937_constants::n / threads_per_generator; i++) diff --git a/library/src/rng/utils/cpp_utils.hpp b/library/src/rng/utils/cpp_utils.hpp index fdb63d8d3..b9682c57c 100644 --- a/library/src/rng/utils/cpp_utils.hpp +++ b/library/src/rng/utils/cpp_utils.hpp @@ -257,7 +257,8 @@ __host__ __device__ vec_wrapper(V) -> vec_wrapper; /// different overload sets for `::min` and `::max` between device and host code. That implementation /// can result in unwanted implicit conversions to `int` in host code. 
template -__host__ __device__ constexpr T max(const T& a, const T& b) +__host__ __device__ +constexpr T max(const T& a, const T& b) { return a < b ? b : a; } @@ -270,7 +271,8 @@ __host__ __device__ constexpr T max(const T& a, const T& b) /// different overload sets for `::min` and `::max` between device and host code. That implementation /// can result in unwanted implicit conversions to `int` in host code. template -__host__ __device__ constexpr T min(const T& a, const T& b) +__host__ __device__ +constexpr T min(const T& a, const T& b) { return a < b ? a : b; } From 846b18878664bbb398202517952be068b0dbcd10 Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Tue, 2 Jul 2024 12:37:04 +0000 Subject: [PATCH 48/54] ci(.gitlab-ci.yml): replace 'ROCM_PATH' variable with 'env:HIP_PATH' as the former is unintuitive --- .gitlab-ci.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d1f591894..e37639d44 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,7 +22,6 @@ variables: GIT_SUBMODULE_STRATEGY: normal # Fetch submodules before job start (non-recursively) - ROCM_PATH: /opt/rocm include: - project: amd/ci-templates @@ -581,9 +580,9 @@ test:nvcc-parity: -D BUILD_SHARED_LIBS="$BUILD_SHARED_LIBS" -D BUILD_TEST=ON -D CMAKE_BUILD_TYPE=Release - -D CMAKE_CXX_COMPILER:FILEPATH="$ROCM_PATH/bin/clang++.exe" + -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe" -D CMAKE_INSTALL_PREFIX:PATH="$CI_PROJECT_DIR/build/install" - -D CMAKE_PREFIX_PATH:PATH="$ROCM_PATH/lib/cmake" + -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}/lib/cmake" -D DEPENDENCIES_FORCE_DOWNLOAD=ON -D DISABLE_WERROR=OFF *>&1 # Building @@ -612,8 +611,8 @@ test:windows: -B "$CI_PROJECT_DIR/build_install_test" -G Ninja -D CMAKE_BUILD_TYPE=Release - -D CMAKE_CXX_COMPILER="$ROCM_PATH/bin/clang++.exe" - -D CMAKE_PREFIX_PATH="$ROCM_PATH/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1 + -D CMAKE_CXX_COMPILER="${env:HIP_PATH}/bin/clang++.exe" + -D 
CMAKE_PREFIX_PATH="${env:HIP_PATH}/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1 # Build package test - cmake --build "$CI_PROJECT_DIR/build_install_test" # Copy rocRAND.dll to the package test build directory From 578cfa7685249b40e974ff65e819bdd34a0092e1 Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Tue, 2 Jul 2024 12:56:49 +0000 Subject: [PATCH 49/54] ci(.gitlab-ci.yml): do not force download deps on windows --- .gitlab-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e37639d44..7d2327964 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -583,7 +583,6 @@ test:nvcc-parity: -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe" -D CMAKE_INSTALL_PREFIX:PATH="$CI_PROJECT_DIR/build/install" -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}/lib/cmake" - -D DEPENDENCIES_FORCE_DOWNLOAD=ON -D DISABLE_WERROR=OFF *>&1 # Building - cmake --build "$CI_PROJECT_DIR/build" *>&1 From 73dea65dcebc1909386e7a7eb7dbd6b17fd5d616 Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Tue, 2 Jul 2024 13:45:58 +0000 Subject: [PATCH 50/54] ci(.gitlab-ci.yml): pass amdclang filepath properly to windows package build test --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7d2327964..b96c07e9d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -610,8 +610,8 @@ test:windows: -B "$CI_PROJECT_DIR/build_install_test" -G Ninja -D CMAKE_BUILD_TYPE=Release - -D CMAKE_CXX_COMPILER="${env:HIP_PATH}/bin/clang++.exe" - -D CMAKE_PREFIX_PATH="${env:HIP_PATH}/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1 + -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe" + -D CMAKE_PREFIX_PATH:FILEPATH="${env:HIP_PATH}/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1 # Build package test - cmake --build "$CI_PROJECT_DIR/build_install_test" # Copy rocRAND.dll to the package test build directory From e1633011251d73ae374d232534a84926ffc01d75 Mon Sep 17 00:00:00 2001 From: Anton Gorenko 
Date: Fri, 12 Jul 2024 17:20:45 +0500 Subject: [PATCH 51/54] Remove unused FindTestU01.cmake --- README.md | 4 +-- cmake/Modules/FindTestU01.cmake | 52 --------------------------------- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 cmake/Modules/FindTestU01.cmake diff --git a/README.md b/README.md index daeabdfd6..4a17574d1 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,4 @@ Bugs and feature requests can be reported through the Contributions of any kind are most welcome! You can find more information at [CONTRIBUTING](./CONTRIBUTING.md). -Licensing information is located at [LICENSE](./LICENSE.txt). Note that [statistical tests](./test/crush) link -to the TestU01 library distributed under GNU General Public License (GPL) version 3. Therefore, the GPL -version 3 license applies to that part of the project. +Licensing information is located at [LICENSE](./LICENSE.txt). diff --git a/cmake/Modules/FindTestU01.cmake b/cmake/Modules/FindTestU01.cmake deleted file mode 100644 index d74f7e1fa..000000000 --- a/cmake/Modules/FindTestU01.cmake +++ /dev/null @@ -1,52 +0,0 @@ -if(NOT TESTU01_ROOT_DIR) - set(TESTU01_ROOT_DIR "/usr/local") - FIND_LIBRARY(testu01 NAMES testu01 PATHS ${TESTU01_ROOT_DIR}/lib) - FIND_LIBRARY(probdist NAMES probdist testu01probdist PATHS ${TESTU01_ROOT_DIR}/lib) - FIND_LIBRARY(mylib NAMES mylib testu01mylib PATHS ${TESTU01_ROOT_DIR}/lib) -else() - FIND_LIBRARY(testu01 NAMES testu01 PATHS ${TESTU01_ROOT_DIR}/lib NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH) - FIND_LIBRARY(probdist NAMES probdist testu01probdist PATHS ${TESTU01_ROOT_DIR}/lib NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH) - FIND_LIBRARY(mylib NAMES mylib testu01mylib PATHS ${TESTU01_ROOT_DIR}/lib NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH) -endif() -mark_as_advanced(TESTU01_ROOT_DIR) - -set(TestU01_ERROR_REASON "") -set(TestU01_FOUND TRUE) - -if(NOT testu01) - set(TestU01_FOUND 0) - string(APPEND TestU01_ERROR_REASON "TestU01 library not found") -endif() -if(NOT probdist) 
- set(TestU01_FOUND 0) - string(APPEND TestU01_ERROR_REASON "ProbDist library not found") -endif() -if(NOT mylib) - set(TestU01_FOUND 0) - string(APPEND TestU01_ERROR_REASON "MyLib library not found") -endif() - -find_path(TestU01_INCLUDE_DIRS bbattery.h PATH_SUFFIXES testu01 PATHS ${TESTU01_ROOT_DIR}/include) -if(NOT TestU01_INCLUDE_DIRS) - set(TestU01_FOUND 0) - string(APPEND TestU01_ERROR_REASON "TestU01 headers not found") -endif() -set(TestU01_LIBRARY_DIRS ${TESTU01_ROOT_DIR}/lib) - -if(TestU01_FOUND) - set(TestU01_LIBRARIES ${testu01} ${probdist} ${mylib}) - set(TestU01_INCLUDE_DIR ${TestU01_INCLUDE_DIRS}) - if(NOT TestU01_FIND_QUIETLY) - message(STATUS "Found TestU01: ${TestU01_LIBRARIES}") - endif() -else() - if(TestU01_FIND_REQUIRED) - message(SEND_ERROR "Unable to find the TestU01.\n${TestU01_ERROR_REASON}") - else() - if(NOT TestU01_FIND_QUIETLY) - if(TestU01_DEBUG OR TestU01_DETAILED_FAILURE_MSG) - message(STATUS "Could NOT find TestU01\n${TestU01_ERROR_REASON}") - endif() - endif() - endif() -endif() From f209572b5db1d7f4b80885e7f94daac3af242746 Mon Sep 17 00:00:00 2001 From: Nick Breed Date: Fri, 12 Jul 2024 09:51:37 +0000 Subject: [PATCH 52/54] Added checks for nullptr data with tests --- library/src/rng/lfsr113.hpp | 7 +++ library/src/rng/mrg.hpp | 5 +++ library/src/rng/mt19937.hpp | 5 +++ library/src/rng/mtgp32.hpp | 5 +++ library/src/rng/philox4x32_10.hpp | 5 +++ library/src/rng/sobol.hpp | 5 +++ library/src/rng/threefry.hpp | 5 +++ library/src/rng/xorwow.hpp | 5 +++ test/test_rocrand_generate.cpp | 54 +++++++++++------------ test/test_rocrand_generate_log_normal.cpp | 50 +++++++++------------ test/test_rocrand_generate_normal.cpp | 50 +++++++++------------ test/test_rocrand_generate_poisson.cpp | 3 ++ test/test_rocrand_generate_uniform.cpp | 50 +++++++++------------ 13 files changed, 137 insertions(+), 112 deletions(-) diff --git a/library/src/rng/lfsr113.hpp b/library/src/rng/lfsr113.hpp index 7b3348dfc..58b9702ba 100644 --- 
a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -383,7 +383,14 @@ class lfsr113_generator_template : public generator_impl_base generator_config config; const hipError_t error = ConfigProvider::template host_config(m_stream, m_order, config); if(error != hipSuccess) + { return ROCRAND_STATUS_INTERNAL_ERROR; + } + + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } status = dynamic_dispatch( m_order, diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index 4a98a8a30..2e464abcf 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -352,6 +352,11 @@ class mrg_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + status = dynamic_dispatch( m_order, [&, this](auto is_dynamic) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index bf3d689f5..0cb7bd6be 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -930,6 +930,11 @@ class mt19937_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + using vec_type = rocrand_impl::aligned_vec_type; const uintptr_t uintptr = reinterpret_cast(data); diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index feeadec94..83f2050fe 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -491,6 +491,11 @@ class mtgp32_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + // The host generator uses a block of size one to emulate a device generator that uses a shared memory state const dim3 threads = std::is_same_v ? 
config.threads : dim3(1); diff --git a/library/src/rng/philox4x32_10.hpp b/library/src/rng/philox4x32_10.hpp index 04db05a16..e4f591458 100644 --- a/library/src/rng/philox4x32_10.hpp +++ b/library/src/rng/philox4x32_10.hpp @@ -320,6 +320,11 @@ class philox4x32_10_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + status = dynamic_dispatch( m_order, [&, this](auto is_dynamic) diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp index 4d849bca4..65aff0bde 100644 --- a/library/src/rng/sobol.hpp +++ b/library/src/rng/sobol.hpp @@ -636,6 +636,11 @@ class sobol_generator_template : public generator_impl_base return status; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + constexpr uint32_t threads = 256; constexpr uint32_t max_blocks = 4096; constexpr uint32_t shared_mem_bytes diff --git a/library/src/rng/threefry.hpp b/library/src/rng/threefry.hpp index 476b7cccb..706374805 100644 --- a/library/src/rng/threefry.hpp +++ b/library/src/rng/threefry.hpp @@ -337,6 +337,11 @@ class threefry_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + status = dynamic_dispatch(m_order, [&, this](auto is_dynamic) { diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 659df06a0..908253a3b 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -350,6 +350,11 @@ class xorwow_generator_template : public generator_impl_base return ROCRAND_STATUS_INTERNAL_ERROR; } + if(data == nullptr) + { + return ROCRAND_STATUS_SUCCESS; + } + status = dynamic_dispatch(m_order, [&, this](auto is_dynamic) diff --git a/test/test_rocrand_generate.cpp b/test/test_rocrand_generate.cpp index 4acc876af..d431710c3 100644 --- a/test/test_rocrand_generate.cpp +++ b/test/test_rocrand_generate.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 
Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -47,20 +47,18 @@ TEST_P(rocrand_generate_tests, int_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate(generator, data, 1) - ); + ROCRAND_CHECK(rocrand_generate(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -85,20 +83,18 @@ TEST_P(rocrand_generate_tests, char_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_char(generator, data, 1) - ); + ROCRAND_CHECK(rocrand_generate_char(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_char(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate_char(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_char(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate_char(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_char(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -123,20 +119,18 @@ TEST_P(rocrand_generate_tests, short_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_short(generator, data, 1) - ); + 
ROCRAND_CHECK(rocrand_generate_short(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_short(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate_short(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_short(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate_short(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_short(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -185,6 +179,10 @@ TEST_P(rocrand_generate_long_long_tests, long_long_test) ROCRAND_CHECK(rocrand_generate_long_long(generator, data, size)); HIP_CHECK(hipDeviceSynchronize()); + // No output pointer + ROCRAND_CHECK(rocrand_generate_long_long(generator, nullptr, size)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipFree(data)); ROCRAND_CHECK(rocrand_destroy_generator(generator)); } diff --git a/test/test_rocrand_generate_log_normal.cpp b/test/test_rocrand_generate_log_normal.cpp index 5f5f05328..7e688a038 100644 --- a/test/test_rocrand_generate_log_normal.cpp +++ b/test/test_rocrand_generate_log_normal.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -49,20 +49,18 @@ TEST_P(rocrand_generate_log_normal_tests, float_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_log_normal(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_log_normal(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_log_normal(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_log_normal(generator, nullptr, size, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -89,20 +87,18 @@ TEST_P(rocrand_generate_log_normal_tests, double_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_log_normal_double(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_double(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_log_normal_double(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_double(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_log_normal_double(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_double(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_log_normal_double(generator, nullptr, size, mean, 
stddev)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -129,20 +125,18 @@ TEST_P(rocrand_generate_log_normal_tests, half_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_log_normal_half(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_half(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_log_normal_half(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_half(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_log_normal_half(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_log_normal_half(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_log_normal_half(generator, nullptr, size, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); diff --git a/test/test_rocrand_generate_normal.cpp b/test/test_rocrand_generate_normal.cpp index 5f02a2202..180f5f2c3 100644 --- a/test/test_rocrand_generate_normal.cpp +++ b/test/test_rocrand_generate_normal.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -49,20 +49,18 @@ TEST_P(rocrand_generate_normal_tests, float_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_normal(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_normal(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_normal(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_normal(generator, nullptr, size, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -89,20 +87,18 @@ TEST_P(rocrand_generate_normal_tests, double_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_normal_double(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_double(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_normal_double(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_double(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_normal_double(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_double(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_normal_double(generator, nullptr, size, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); 
HIP_CHECK(hipFree(data)); @@ -129,20 +125,18 @@ TEST_P(rocrand_generate_normal_tests, half_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_normal_half(generator, data, 1, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_half(generator, data, 1, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_normal_half(generator, data+1, 2, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_half(generator, data + 1, 2, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_normal_half(generator, data, size, mean, stddev) - ); + ROCRAND_CHECK(rocrand_generate_normal_half(generator, data, size, mean, stddev)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_normal_half(generator, nullptr, size, mean, stddev)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); diff --git a/test/test_rocrand_generate_poisson.cpp b/test/test_rocrand_generate_poisson.cpp index 4c7befb4e..dc898bd2b 100644 --- a/test/test_rocrand_generate_poisson.cpp +++ b/test/test_rocrand_generate_poisson.cpp @@ -132,6 +132,9 @@ void test_multiple_lambdas(const rocrand_rng_type rng_type, const hipStream_t st HIP_CHECK(hipFree(d_ptr)); } + // No output pointer + ROCRAND_CHECK(rocrand_generate_poisson(generator, nullptr, size, lambdas[0])); + ROCRAND_CHECK(rocrand_destroy_generator(generator)); } diff --git a/test/test_rocrand_generate_uniform.cpp b/test/test_rocrand_generate_uniform.cpp index fd19a43ff..6d0f350f7 100644 --- a/test/test_rocrand_generate_uniform.cpp +++ b/test/test_rocrand_generate_uniform.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -47,20 +47,18 @@ TEST_P(rocrand_generate_uniform_tests, float_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_uniform(generator, data, 1) - ); + ROCRAND_CHECK(rocrand_generate_uniform(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_uniform(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate_uniform(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_uniform(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate_uniform(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_uniform(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -85,20 +83,18 @@ TEST_P(rocrand_generate_uniform_tests, double_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - rocrand_generate_uniform_double(generator, data, 1) - ); + ROCRAND_CHECK(rocrand_generate_uniform_double(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_uniform_double(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate_uniform_double(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_uniform_double(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate_uniform_double(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_uniform_double(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); @@ -123,20 +119,18 @@ TEST_P(rocrand_generate_uniform_tests, half_test) HIP_CHECK(hipDeviceSynchronize()); // Any sizes - ROCRAND_CHECK( - 
rocrand_generate_uniform_half(generator, data, 1) - ); + ROCRAND_CHECK(rocrand_generate_uniform_half(generator, data, 1)); HIP_CHECK(hipDeviceSynchronize()); // Any alignment - ROCRAND_CHECK( - rocrand_generate_uniform_half(generator, data+1, 2) - ); + ROCRAND_CHECK(rocrand_generate_uniform_half(generator, data + 1, 2)); HIP_CHECK(hipDeviceSynchronize()); - ROCRAND_CHECK( - rocrand_generate_uniform_half(generator, data, size) - ); + ROCRAND_CHECK(rocrand_generate_uniform_half(generator, data, size)); + HIP_CHECK(hipDeviceSynchronize()); + + // No output pointer + ROCRAND_CHECK(rocrand_generate_uniform_half(generator, nullptr, size)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(data)); From 49add413533cb75940517185a6da12e876ddd693 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 22 Jul 2024 15:59:33 +0500 Subject: [PATCH 53/54] Fix bit rotation for threefry2x64 and threefry4x64 operator >> has higher precedence than operator &. This bug causes very low quality in crush tests. --- CHANGELOG.md | 6 ++++-- library/include/rocrand/rocrand_threefry_common.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd736c772..4cdb1cdf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,8 +17,10 @@ Documentation for rocRAND is available at ## (Unreleased) rocRAND-3.1.1 for ROCm 6.2.0 ## Fixes - * Fixed " unknown extension ?>" issue in scripts/config-tuning/select_best_config.py - when using python version thats older than 3.11 + +* Fixed " unknown extension ?>" issue in scripts/config-tuning/select_best_config.py + when using python version thats older than 3.11 +* Fixed low random sequence quality of `ROCRAND_RNG_PSEUDO_THREEFRY2_64_20` and `ROCRAND_RNG_PSEUDO_THREEFRY4_64_20`. 
## (Unreleased) rocRAND-3.1.0 for ROCm 6.2.0 diff --git a/library/include/rocrand/rocrand_threefry_common.h b/library/include/rocrand/rocrand_threefry_common.h index 6fb707c2d..f6999cf4e 100644 --- a/library/include/rocrand/rocrand_threefry_common.h +++ b/library/include/rocrand/rocrand_threefry_common.h @@ -70,7 +70,7 @@ template<> __forceinline__ __device__ __host__ unsigned long long rotl(unsigned long long x, int d) { - return ((x << d) | (x >> (64 - d) & 63)); + return (x << (d & 63)) | (x >> ((64 - d) & 63)); }; template<> From ec409a3b2cfd95122be9b030ddf3622b4af5d9db Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Thu, 8 Aug 2024 07:05:20 +0000 Subject: [PATCH 54/54] chore: bump version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30076d6e2..83e88331f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,7 +160,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) endif() # Set version variables -rocm_setup_version( VERSION "3.1.0" ) +rocm_setup_version( VERSION "3.2.0" ) set ( rocrand_VERSION ${rocRAND_VERSION} ) # Old-style version number used within the library's API. rocrand_get_version should be modified. math(EXPR rocrand_VERSION_NUMBER "${rocRAND_VERSION_MAJOR} * 100000 + ${rocRAND_VERSION_MINOR} * 100 + ${rocRAND_VERSION_PATCH}")