diff --git a/CHANGELOG.md b/CHANGELOG.md index 08ff59a48..45fcaae6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ Documentation for rocRAND is available at [https://rocm.docs.amd.com/projects/rocRAND/en/latest/](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) +## (unreleased) rocRAND 3.3.0 for ROCm 6.5 + +### Changed +* Updated several `gfx942` auto tuning parameters. + +### Fixed +* Fixed an issue where `mt19937.hpp` would cause kernel errors during auto tuning. + ## rocRAND 3.3.0 for ROCm 6.4 ### Added diff --git a/library/src/rng/config/lfsr113_config.hpp b/library/src/rng/config/lfsr113_config.hpp index 3556dc240..2231dc789 100644 --- a/library/src/rng/config/lfsr113_config.hpp +++ b/library/src/rng/config/lfsr113_config.hpp @@ -42,10 +42,11 @@ struct generator_config_selector case target_arch::gfx1101: return 128; case target_arch::gfx1100: return 64; case target_arch::gfx1030: return 64; - case target_arch::gfx942: return 512; + case target_arch::gfx942: return 256; case target_arch::gfx90a: return 64; case target_arch::gfx908: return 256; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 128; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 1024; case target_arch::gfx906: return 2048; + case target_arch::gfx1201: return 512; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/mrg31k3p_config.hpp b/library/src/rng/config/mrg31k3p_config.hpp index 4b9021296..3f4179aee 100644 --- a/library/src/rng/config/mrg31k3p_config.hpp +++ b/library/src/rng/config/mrg31k3p_config.hpp @@ -46,6 +46,7 @@ struct generator_config_selector case target_arch::gfx90a: return 512; case target_arch::gfx908: return 1024; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 600; case target_arch::gfx906: return 1792; + case target_arch::gfx1201: return 512; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/mrg32k3a_config.hpp b/library/src/rng/config/mrg32k3a_config.hpp index 5857b7a11..2726d66cd 100644 --- a/library/src/rng/config/mrg32k3a_config.hpp +++ b/library/src/rng/config/mrg32k3a_config.hpp @@ -41,11 +41,12 @@ struct generator_config_selector case target_arch::gfx1102: return 128; case target_arch::gfx1101: return 128; case target_arch::gfx1100: return 128; - case target_arch::gfx942: return 256; + case target_arch::gfx942: return 1024; case target_arch::gfx90a: return 256; case target_arch::gfx1030: return 256; case target_arch::gfx908: return 1024; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 600; case target_arch::gfx906: return 2048; + case target_arch::gfx1201: return 256; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/mt19937_config.hpp b/library/src/rng/config/mt19937_config.hpp index 8a02a0d67..1cc472eeb 100644 --- a/library/src/rng/config/mt19937_config.hpp +++ b/library/src/rng/config/mt19937_config.hpp @@ -28,7 +28,7 @@ * This file is automatically generated by `/scripts/config-tuning/select_best_config.py`. */ -namespace rocrand_host::detail +namespace rocrand_impl::host { template @@ -41,9 +41,10 @@ struct generator_config_selector case target_arch::gfx1102: return 128; case target_arch::gfx1101: return 128; case target_arch::gfx1100: return 64; - case target_arch::gfx942: return 128; + case target_arch::gfx942: return 256; case target_arch::gfx90a: return 1024; case target_arch::gfx908: return 512; + case target_arch::gfx1201: return 64; default: return generator_config_defaults::threads; } @@ -59,6 +60,7 @@ struct generator_config_selector case target_arch::gfx942: return 1024; case target_arch::gfx90a: return 64; case target_arch::gfx908: return 64; + case target_arch::gfx1201: return 512; default: return generator_config_defaults::blocks; } @@ -67,4 +69,4 @@ struct generator_config_selector } // end namespace rocrand_host::detail -#endif // ROCRAND_RNG_CONFIG_MT19937_HPP_ +#endif // ROCRAND_RNG_CONFIG_MT19937_HPP_ \ No newline at end of file diff --git a/library/src/rng/config/mtgp32_config.hpp b/library/src/rng/config/mtgp32_config.hpp index f915dbac9..6df560d3c 100644 --- a/library/src/rng/config/mtgp32_config.hpp +++ b/library/src/rng/config/mtgp32_config.hpp @@ -46,6 +46,7 @@ struct generator_config_selector case target_arch::gfx90a: return 256; case target_arch::gfx908: return 256; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 256; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 512; case target_arch::gfx908: return 480; case target_arch::gfx906: return 448; + case target_arch::gfx1201: return 512; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/philox4_32_10_config.hpp b/library/src/rng/config/philox4_32_10_config.hpp index abc4e1bdc..bb111482d 100644 --- a/library/src/rng/config/philox4_32_10_config.hpp +++ b/library/src/rng/config/philox4_32_10_config.hpp @@ -42,10 +42,11 @@ struct generator_config_selector case target_arch::gfx1101: return 1024; case target_arch::gfx1100: return 512; case target_arch::gfx1030: return 1024; - case target_arch::gfx942: return 1024; + case target_arch::gfx942: return 512; case target_arch::gfx90a: return 512; case target_arch::gfx908: return 512; case target_arch::gfx906: return 64; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 3840; case target_arch::gfx906: return 896; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/threefry2_32_20_config.hpp b/library/src/rng/config/threefry2_32_20_config.hpp index 9f614754d..b81655ae1 100644 --- a/library/src/rng/config/threefry2_32_20_config.hpp +++ b/library/src/rng/config/threefry2_32_20_config.hpp @@ -42,10 +42,11 @@ struct generator_config_selector case target_arch::gfx1101: return 256; case target_arch::gfx1100: return 1024; case target_arch::gfx1030: return 256; - case target_arch::gfx942: return 256; + case target_arch::gfx942: return 512; case target_arch::gfx90a: return 512; case target_arch::gfx908: return 512; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 1200; case target_arch::gfx906: return 896; + case target_arch::gfx1201: return 2048; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/threefry2_64_20_config.hpp b/library/src/rng/config/threefry2_64_20_config.hpp index c1b3b6e37..ec7f718bc 100644 --- a/library/src/rng/config/threefry2_64_20_config.hpp +++ b/library/src/rng/config/threefry2_64_20_config.hpp @@ -46,6 +46,7 @@ struct generator_config_selector case target_arch::gfx90a: return 256; case target_arch::gfx908: return 256; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 512; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 960; case target_arch::gfx906: return 560; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/threefry4_32_20_config.hpp b/library/src/rng/config/threefry4_32_20_config.hpp index c2e2cf8ec..4cb206afa 100644 --- a/library/src/rng/config/threefry4_32_20_config.hpp +++ b/library/src/rng/config/threefry4_32_20_config.hpp @@ -42,10 +42,11 @@ struct generator_config_selector case target_arch::gfx1101: return 512; case target_arch::gfx1100: return 1024; case target_arch::gfx1030: return 1024; - case target_arch::gfx942: return 1024; + case target_arch::gfx942: return 512; case target_arch::gfx90a: return 256; case target_arch::gfx908: return 256; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 1200; case target_arch::gfx906: return 896; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/threefry4_64_20_config.hpp b/library/src/rng/config/threefry4_64_20_config.hpp index 636e886d0..b46aa5cd2 100644 --- a/library/src/rng/config/threefry4_64_20_config.hpp +++ b/library/src/rng/config/threefry4_64_20_config.hpp @@ -46,6 +46,7 @@ struct generator_config_selector case target_arch::gfx90a: return 256; case target_arch::gfx908: return 128; case target_arch::gfx906: return 128; + case target_arch::gfx1201: return 128; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 1920; case target_arch::gfx906: return 1792; + case target_arch::gfx1201: return 2048; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config/xorwow_config.hpp b/library/src/rng/config/xorwow_config.hpp index 146573638..44a31a5b6 100644 --- a/library/src/rng/config/xorwow_config.hpp +++ b/library/src/rng/config/xorwow_config.hpp @@ -46,6 +46,7 @@ struct generator_config_selector case target_arch::gfx90a: return 128; case target_arch::gfx908: return 256; case target_arch::gfx906: return 256; + case target_arch::gfx1201: return 1024; default: return generator_config_defaults::threads; } @@ -63,6 +64,7 @@ struct generator_config_selector case target_arch::gfx90a: return 2048; case target_arch::gfx908: return 600; case target_arch::gfx906: return 560; + case target_arch::gfx1201: return 256; default: return generator_config_defaults::blocks; } diff --git a/library/src/rng/config_types.hpp b/library/src/rng/config_types.hpp index cd0a1f6a6..587052e38 100644 --- a/library/src/rng/config_types.hpp +++ b/library/src/rng/config_types.hpp @@ -53,6 +53,7 @@ enum class target_arch : unsigned int gfx1100 = 1100, gfx1101 = 1101, gfx1102 = 1102, + gfx1201 = 1201, unknown = std::numeric_limits::max(), }; @@ -85,6 +86,8 @@ __host__ __device__ constexpr target_arch get_device_arch() return target_arch::gfx1101; #elif defined(__gfx1102__) return target_arch::gfx1102; +#elif defined(__gfx1201__) + return target_arch::gfx1201; #else return target_arch::unknown; #endif @@ -106,7 +109,8 @@ inline target_arch parse_gcn_arch(const std::string& arch_name) "gfx1030", "gfx1100", "gfx1101", - "gfx1102"}; + "gfx1102", + "gfx1201"}; const target_arch target_architectures[] = { target_arch::gfx900, target_arch::gfx902, @@ -120,6 +124,7 @@ inline target_arch parse_gcn_arch(const std::string& arch_name) target_arch::gfx1100, target_arch::gfx1101, target_arch::gfx1102, + target_arch::gfx1201, }; static_assert(sizeof(target_names) / sizeof(target_names[0]) == sizeof(target_architectures) / sizeof(target_architectures[0]), @@ -320,6 +325,10 @@ hipError_t get_generator_config(const hipStream_t stream, { target_arch current_arch; const hipError_t error = get_device_arch(stream, current_arch); + printf("%d\n", current_arch); + printf("%d\n", current_arch); + printf("%d\n", current_arch); + printf("%d\n", current_arch); if(error != hipSuccess) { return error; @@ -329,6 +338,12 @@ hipError_t get_generator_config(const hipStream_t stream, } else { + printf("IN ELSE OF IS_ORDERING_DYNAMIC\n"); + printf("IN ELSE OF IS_ORDERING_DYNAMIC\n"); + printf("IN ELSE OF IS_ORDERING_DYNAMIC\n"); + printf("IN ELSE OF IS_ORDERING_DYNAMIC\n"); + printf("IN ELSE OF IS_ORDERING_DYNAMIC\n"); + config.threads = generator_config_defaults::threads; config.blocks = generator_config_defaults::blocks; } diff --git a/library/src/rng/lfsr113.hpp b/library/src/rng/lfsr113.hpp index 58b9702ba..6da57058a 100644 --- a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -392,6 +392,7 @@ class lfsr113_generator_template : public generator_impl_base return ROCRAND_STATUS_SUCCESS; } + printf("Blocks: %d ---- Threads: %d\n", config.blocks, config.threads); status = dynamic_dispatch( m_order, [&, this](auto is_dynamic) diff --git a/library/src/rng/mt19937.hpp b/library/src/rng/mt19937.hpp index 0cb7bd6be..f17fe70e6 100644 --- a/library/src/rng/mt19937.hpp +++ b/library/src/rng/mt19937.hpp @@ -50,6 +50,7 @@ #ifndef ROCRAND_RNG_MT19937_H_ #define ROCRAND_RNG_MT19937_H_ +#include "config/mt19937_config.hpp" #include "common.hpp" #include "config_types.hpp" @@ -874,18 +875,19 @@ class mt19937_generator_template : public generator_impl_base system_type::free(d_mt19937_jump); // This kernel is not actually tuned for ordering, but config is needed for device-side compile time check of the generator count - dynamic_dispatch(m_order, - [&, this](auto is_dynamic) - { - status = system_type::template launch< - init_engines_mt19937>( - dim3(config.blocks), - dim3(config.threads), - 0, - m_stream, - m_engines, - d_engines); - }); + dynamic_dispatch( + m_order, + [&, this](auto is_dynamic) + { + status + = system_type::template launch, + ConfigProvider>(dim3(config.blocks), + dim3(config.threads), + 0, + m_stream, + m_engines, + d_engines); + }); if(status != ROCRAND_STATUS_SUCCESS) { system_type::free(d_engines); @@ -983,20 +985,22 @@ class mt19937_generator_template : public generator_impl_base is_dynamic, T, vec_type, - Distribution>>( - dim3(config.blocks), - dim3(config.threads), - 0, - m_stream, - m_engines, - m_start_input, - data, - size, - vec_data, - vec_size, - head_size, - tail_size, - distribution); + Distribution>, + ConfigProvider, + T, + is_dynamic>(dim3(config.blocks), + dim3(config.threads), + 0, + m_stream, + m_engines, + m_start_input, + data, + size, + vec_data, + vec_size, + head_size, + tail_size, + distribution); }); if(status != ROCRAND_STATUS_SUCCESS) { @@ -1014,20 +1018,22 @@ class mt19937_generator_template : public generator_impl_base is_dynamic, T, vec_type, - Distribution>>( - dim3(config.blocks), - dim3(config.threads), - 0, - m_stream, - m_engines, - m_start_input, - data, - size, - vec_data, - vec_size, - head_size, - tail_size, - distribution); + Distribution>, + ConfigProvider, + T, + is_dynamic>(dim3(config.blocks), + dim3(config.threads), + 0, + m_stream, + m_engines, + m_start_input, + data, + size, + vec_data, + vec_size, + head_size, + tail_size, + distribution); }); if(status != ROCRAND_STATUS_SUCCESS) { @@ -1122,4 +1128,4 @@ using mt19937_generator_host } // namespace rocrand_impl::host -#endif // ROCRAND_RNG_MT19937_H_ +#endif // ROCRAND_RNG_MT19937_H_ \ No newline at end of file