From a7c9b690d5864d64099ddd530e942173d3963713 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 11:29:40 -0400
Subject: [PATCH 01/44] Add GPU support to airy functions

---
 include/boost/math/special_functions/airy.hpp     | 52 +++++++++++--------
 .../boost/math/special_functions/math_fwd.hpp     | 48 ++++++++---------
 2 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/include/boost/math/special_functions/airy.hpp b/include/boost/math/special_functions/airy.hpp
index 06eee9238..65114089a 100644
--- a/include/boost/math/special_functions/airy.hpp
+++ b/include/boost/math/special_functions/airy.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2012.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt
 // or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -7,19 +8,24 @@
 #ifndef BOOST_MATH_AIRY_HPP
 #define BOOST_MATH_AIRY_HPP
 
-#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
 
 namespace boost{ namespace math{
 
 namespace detail{
 
 template <class T, class Policy>
-T airy_ai_imp(T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
@@ -57,7 +63,7 @@ T airy_ai_imp(T x, const Policy& pol)
 }
 
 template <class T, class Policy>
-T airy_bi_imp(T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
@@ -90,7 +96,7 @@ T airy_bi_imp(T x, const Policy& pol)
 }
 
 template <class T, class Policy>
-T airy_ai_prime_imp(T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
@@ -125,7 +131,7 @@ T airy_ai_prime_imp(T x, const Policy& pol)
 }
 
 template <class T, class Policy>
-T airy_bi_prime_imp(T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING
@@ -156,7 +162,7 @@ T airy_bi_prime_imp(T x, const Policy& pol)
 }
 
 template <class T, class Policy>
-T airy_ai_zero_imp(int m, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_ai_zero_imp(int m, const Policy& pol)
 {
    BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt.
@@ -209,7 +215,7 @@ T airy_ai_zero_imp(int m, const Policy& pol)
 }
 
 template <class T, class Policy>
-T airy_bi_zero_imp(int m, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T airy_bi_zero_imp(int m, const Policy& pol)
 {
    BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt.
@@ -263,7 +269,7 @@ T airy_bi_zero_imp(int m, const Policy& pol)
 
 } // namespace detail
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type airy_ai(T x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_ai(T x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename tools::promote_args<T>::type result_type;
@@ -279,13 +285,13 @@ inline typename tools::promote_args<T>::type airy_ai(T x, const Policy&)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type airy_ai(T x)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_ai(T x)
 {
    return airy_ai(x, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type airy_bi(T x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_bi(T x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename tools::promote_args<T>::type result_type;
@@ -301,13 +307,13 @@ inline typename tools::promote_args<T>::type airy_bi(T x, const Policy&)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type airy_bi(T x)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_bi(T x)
 {
    return airy_bi(x, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type airy_ai_prime(T x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_ai_prime(T x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename tools::promote_args<T>::type result_type;
@@ -323,13 +329,13 @@ inline typename tools::promote_args<T>::type airy_ai_prime(T x, const Policy&)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type airy_ai_prime(T x)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_ai_prime(T x)
 {
    return airy_ai_prime(x, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline typename tools::promote_args<T>::type airy_bi_prime(T x, const Policy&)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_bi_prime(T x, const Policy&)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename tools::promote_args<T>::type result_type;
@@ -345,13 +351,13 @@ inline typename tools::promote_args<T>::type airy_bi_prime(T x, const Policy&)
 }
 
 template <class T>
-inline typename tools::promote_args<T>::type airy_bi_prime(T x)
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type airy_bi_prime(T x)
 {
    return airy_bi_prime(x, policies::policy<>());
 }
 
 template <class T, class Policy>
-inline T airy_ai_zero(int m, const Policy& /*pol*/)
+BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m, const Policy& /*pol*/)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename policies::evaluation<T, Policy>::type value_type;
@@ -371,13 +377,13 @@ inline T airy_ai_zero(int m, const Policy& /*pol*/)
 }
 
 template <class T>
-inline T airy_ai_zero(int m)
+BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m)
 {
    return airy_ai_zero<T>(m, policies::policy<>());
 }
 
 template <class OutputIterator, class Policy>
-inline OutputIterator airy_ai_zero(
+BOOST_MATH_GPU_ENABLED inline OutputIterator airy_ai_zero(
                          int start_index,
                          unsigned number_of_zeros,
                          OutputIterator out_it,
                          const Policy&)
@@ -399,7 +405,7 @@ inline OutputIterator airy_ai_zero(
 }
 
 template <class OutputIterator>
-inline OutputIterator airy_ai_zero(
+BOOST_MATH_GPU_ENABLED inline OutputIterator airy_ai_zero(
                          int start_index,
                          unsigned number_of_zeros,
                          OutputIterator out_it)
@@ -408,7 +414,7 @@ inline OutputIterator airy_ai_zero(
 }
 
 template <class T, class Policy>
-inline T airy_bi_zero(int m, const Policy& /*pol*/)
+BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m, const Policy& /*pol*/)
 {
    BOOST_FPU_EXCEPTION_GUARD
    typedef typename policies::evaluation<T, Policy>::type value_type;
@@ -428,13 +434,13 @@ inline T airy_bi_zero(int m, const Policy& /*pol*/)
 }
 
 template <class T>
-inline T airy_bi_zero(int m)
+BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m)
 {
    return airy_bi_zero<T>(m, policies::policy<>());
 }
 
 template <class OutputIterator, class Policy>
-inline OutputIterator airy_bi_zero(
+BOOST_MATH_GPU_ENABLED inline OutputIterator airy_bi_zero(
                          int start_index,
                          unsigned number_of_zeros,
                          OutputIterator out_it,
                          const Policy&)
@@ -456,7 +462,7 @@ inline OutputIterator airy_bi_zero(
 }
 
 template <class OutputIterator>
-inline OutputIterator airy_bi_zero(
+BOOST_MATH_GPU_ENABLED inline OutputIterator airy_bi_zero(
                          int start_index,
                          unsigned number_of_zeros,
                          OutputIterator out_it)
diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp
index 22c388949..2f4ccd5e7 100644
--- a/include/boost/math/special_functions/math_fwd.hpp
+++ b/include/boost/math/special_functions/math_fwd.hpp
@@ -816,58 +816,58 @@ namespace boost
          std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
 
   template <class T, class Policy>
-   tools::promote_args_t<T> airy_ai(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai(T x, const Policy&);
 
   template <class T>
-   tools::promote_args_t<T> airy_ai(T x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai(T x);
 
   template <class T, class Policy>
-   tools::promote_args_t<T> airy_bi(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_bi(T x, const Policy&);
 
   template <class T>
-   tools::promote_args_t<T> airy_bi(T x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_bi(T x);
 
   template <class T, class Policy>
-   tools::promote_args_t<T> airy_ai_prime(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai_prime(T x, const Policy&);
 
   template <class T>
-   tools::promote_args_t<T> airy_ai_prime(T x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai_prime(T x);
 
   template <class T, class Policy>
-   tools::promote_args_t<T> airy_bi_prime(T x, const Policy&);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_bi_prime(T x, const Policy&);
 
   template <class T>
-   tools::promote_args_t<T> airy_bi_prime(T x);
+   BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_bi_prime(T x);
 
   template <class T>
-   T airy_ai_zero(int m);
+   BOOST_MATH_GPU_ENABLED T airy_ai_zero(int m);
   template <class T, class Policy>
-   T airy_ai_zero(int m, const Policy&);
+   BOOST_MATH_GPU_ENABLED T airy_ai_zero(int m, const Policy&);
 
   template <class OutputIterator>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                      int start_index,
                      unsigned number_of_zeros,
                      OutputIterator out_it);
 
   template <class OutputIterator, class Policy>
-   OutputIterator airy_ai_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
                      int start_index,
                      unsigned number_of_zeros,
                      OutputIterator out_it,
                      const Policy&);
 
   template <class T>
-   T airy_bi_zero(int m);
+   BOOST_MATH_GPU_ENABLED T airy_bi_zero(int m);
   template <class T, class Policy>
-   T airy_bi_zero(int m, const Policy&);
+   BOOST_MATH_GPU_ENABLED T airy_bi_zero(int m, const Policy&);
 
   template <class OutputIterator>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                      int start_index,
                      unsigned number_of_zeros,
                      OutputIterator out_it);
 
   template <class OutputIterator, class Policy>
-   OutputIterator airy_bi_zero(
+   BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
                      int start_index,
                      unsigned number_of_zeros,
                      OutputIterator out_it,
                      const Policy&);
@@ -1769,33 +1769,33 @@ template <class T, class U>\
   { return boost::math::jacobi_theta4m1tau(z, q, Policy()); }\
\
   template <class T>\
-  inline boost::math::tools::promote_args_t<T> airy_ai(T x)\
+  BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> airy_ai(T x)\
  { return boost::math::airy_ai(x, Policy()); }\
\
   template <class T>\
-  inline boost::math::tools::promote_args_t<T> airy_bi(T x)\
+  BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> airy_bi(T x)\
  { return boost::math::airy_bi(x, Policy()); }\
\
   template <class T>\
-  inline boost::math::tools::promote_args_t<T> airy_ai_prime(T x)\
+  BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> airy_ai_prime(T x)\
  { return boost::math::airy_ai_prime(x, Policy()); }\
\
   template <class T>\
-  inline boost::math::tools::promote_args_t<T> airy_bi_prime(T x)\
+  BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> airy_bi_prime(T x)\
  { return boost::math::airy_bi_prime(x, Policy()); }\
\
   template <class T>\
-  inline T airy_ai_zero(int m)\
+  BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m)\
  { return boost::math::airy_ai_zero<T>(m, Policy()); }\
  template <class OutputIterator>\
-  OutputIterator airy_ai_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\
+  BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\
  { return boost::math::airy_ai_zero(start_index, number_of_zeros, out_it, Policy()); }\
\
   template <class T>\
-  inline T airy_bi_zero(int m)\
+  BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m)\
  { return boost::math::airy_bi_zero<T>(m, Policy()); }\
  template <class OutputIterator>\
-  OutputIterator airy_bi_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\
+  BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\
  { return boost::math::airy_bi_zero(start_index, number_of_zeros, out_it, Policy()); }\
\
   template \
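Patch 01 hinges on the BOOST_MATH_GPU_ENABLED annotation, which Boost.Math defines in <boost/math/tools/config.hpp>. As a rough sketch of the idea only, not the actual definition (the real config header covers more platforms and compiler quirks), it behaves like:

    // Illustration, not the actual Boost.Math definition.
    #if defined(__CUDACC__) || defined(__CUDACC_RTC__)
    // Under NVCC/NVRTC every annotated function becomes callable from
    // both host and device code.
    #  define BOOST_MATH_GPU_ENABLED __host__ __device__
    #else
    // On host-only and SYCL builds the macro expands to nothing, so the
    // annotated declarations are unchanged for existing users.
    #  define BOOST_MATH_GPU_ENABLED
    #endif

This is why the change is non-invasive: on a plain host compiler every annotated signature is exactly what it was before.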

From 4ac616f7547b6c5d600e264174f7edc9388d757e Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 11:29:53 -0400
Subject: [PATCH 02/44] Add SYCL testing of airy functions

---
 test/sycl_jamfile  |  2 ++
 test/test_airy.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 162603108..9c8978a4e 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -46,6 +46,8 @@ run test_weibull.cpp ;
 
 # Special Functions
 run pow_test.cpp ;
+run test_airy.cpp ;
+
 run test_beta_simple.cpp ;
 run test_beta.cpp ;
 run test_ibeta.cpp ;
diff --git a/test/test_airy.cpp b/test/test_airy.cpp
index d42fbb4ca..548d4de15 100644
--- a/test/test_airy.cpp
+++ b/test/test_airy.cpp
@@ -3,14 +3,21 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include
+#endif
 
 #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
 #define BOOST_TEST_MAIN
 #include
 #include
 #include
+#include
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include
+#endif
+
 #include
 #include
 #include
@@ -50,6 +57,11 @@ void test_airy(T, const char* name)
    T tol = boost::math::tools::epsilon<T>() * 800;
    if ((std::numeric_limits<T>::digits > 100) || (std::numeric_limits<T>::digits == 0))
       tol *= 2;
+
+   #ifdef SYCL_LANGUAGE_VERSION
+   tol *= 5;
+   #endif
+
    for(unsigned i = 0; i < data.size(); ++i)
    {
       BOOST_CHECK_CLOSE_FRACTION(data[i][1], boost::math::airy_ai(data[i][0]), tol);
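The sycl_jamfile entry above compiles the existing host test with a SYCL device compiler. A minimal sketch of the usage pattern that enables (assuming a SYCL 2020 toolchain with unified shared memory; this snippet is not part of the patch):

    #include <sycl/sycl.hpp>
    #include <boost/math/special_functions/airy.hpp>

    int main()
    {
        sycl::queue q;
        // Unified shared memory: host and device share one allocation.
        double* v = sycl::malloc_shared<double>(2, q);
        v[0] = 0.5;
        // airy_ai is now BOOST_MATH_GPU_ENABLED, so it can run inside a kernel.
        q.single_task([=] { v[1] = boost::math::airy_ai(v[0]); }).wait();
        sycl::free(v, q);
        return 0;
    }

SYCL device passes define SYCL_LANGUAGE_VERSION, which is what gates the precompiled-header include and the 5x tolerance bump in test_airy.cpp above.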

From 6772f171e817f204375380e677bd21050e085af5 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 12:02:51 -0400
Subject: [PATCH 03/44] Add CUDA testing of airy functions

---
 test/cuda_jamfile                 |   9 +++
 test/test_airy_ai_double.cu       | 100 ++++++++++++++++++++++++++++++
 test/test_airy_ai_float.cu        | 100 ++++++++++++++++++++++++++++++
 test/test_airy_ai_prime_double.cu | 100 ++++++++++++++++++++++++++++++
 test/test_airy_ai_prime_float.cu  | 100 ++++++++++++++++++++++++++++++
 test/test_airy_bi_double.cu       | 100 ++++++++++++++++++++++++++++++
 test/test_airy_bi_float.cu        | 100 ++++++++++++++++++++++++++++++
 test/test_airy_bi_prime_double.cu | 100 ++++++++++++++++++++++++++++++
 test/test_airy_bi_prime_float.cu  | 100 ++++++++++++++++++++++++++++++
 9 files changed, 809 insertions(+)
 create mode 100644 test/test_airy_ai_double.cu
 create mode 100644 test/test_airy_ai_float.cu
 create mode 100644 test/test_airy_ai_prime_double.cu
 create mode 100644 test/test_airy_ai_prime_float.cu
 create mode 100644 test/test_airy_bi_double.cu
 create mode 100644 test/test_airy_bi_float.cu
 create mode 100644 test/test_airy_bi_prime_double.cu
 create mode 100644 test/test_airy_bi_prime_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 0a32ef099..0e4ead125 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -245,6 +245,15 @@ run test_weibull_quan_double.cu ;
 run test_weibull_quan_float.cu ;
 
 # Special Functions
+run test_airy_ai_double.cu ;
+run test_airy_ai_float.cu ;
+run test_airy_ai_prime_double.cu ;
+run test_airy_ai_prime_float.cu ;
+run test_airy_bi_double.cu ;
+run test_airy_bi_float.cu ;
+run test_airy_bi_prime_double.cu ;
+run test_airy_bi_prime_float.cu ;
+
 run test_beta_double.cu ;
 run test_beta_float.cu ;
 run test_betac_double.cu ;
diff --git a/test/test_airy_ai_double.cu b/test/test_airy_ai_double.cu
new file mode 100644
index 000000000..fad46bd9d
--- /dev/null
+++ b/test/test_airy_ai_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
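These kernels lean on two small test-local headers. cuda_managed_ptr.hpp is not shown in the patch; an assumed shape, for illustration only, is an RAII array in CUDA unified memory:

    #include <cuda_runtime.h>
    #include <cstddef>

    // Hypothetical sketch of test/cuda_managed_ptr.hpp: one allocation visible
    // to both host and device, so the test needs no explicit cudaMemcpy calls.
    template <class T>
    class cuda_managed_ptr
    {
        T* ptr_ = nullptr;
    public:
        explicit cuda_managed_ptr(std::size_t n) { cudaMallocManaged(&ptr_, n * sizeof(T)); }
        ~cuda_managed_ptr() { cudaFree(ptr_); }
        cuda_managed_ptr(const cuda_managed_ptr&) = delete;
        cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;
        T* get() const { return ptr_; }                         // raw pointer for kernel arguments
        T& operator[](std::size_t i) const { return ptr_[i]; }  // host-side element access
    };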
diff --git a/test/test_airy_ai_float.cu b/test/test_airy_ai_float.cu
new file mode 100644
index 000000000..b9149aec3
--- /dev/null
+++ b/test/test_airy_ai_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
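The `watch` type from stopwatch.hpp is used only through elapsed() and reset(); a minimal std::chrono implementation with that interface (again an assumed shape, not the actual header) would be:

    #include <chrono>

    class watch
    {
        std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
    public:
        // Seconds since construction or the most recent reset().
        double elapsed() const
        {
            return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
        }
        void reset() { start_ = std::chrono::steady_clock::now(); }
    };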
diff --git a/test/test_airy_ai_prime_double.cu b/test/test_airy_ai_prime_double.cu
new file mode 100644
index 000000000..1a6bcd710
--- /dev/null
+++ b/test/test_airy_ai_prime_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_ai_prime_float.cu b/test/test_airy_ai_prime_float.cu
new file mode 100644
index 000000000..df690c2b1
--- /dev/null
+++ b/test/test_airy_ai_prime_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_bi_double.cu b/test/test_airy_bi_double.cu
new file mode 100644
index 000000000..60001a3fe
--- /dev/null
+++ b/test/test_airy_bi_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_bi(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_bi_float.cu b/test/test_airy_bi_float.cu
new file mode 100644
index 000000000..ed729bfe7
--- /dev/null
+++ b/test/test_airy_bi_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_bi(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_bi_prime_double.cu b/test/test_airy_bi_prime_double.cu
new file mode 100644
index 000000000..a73e43f25
--- /dev/null
+++ b/test/test_airy_bi_prime_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_bi_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
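The pass criterion in every kernel test is boost::math::epsilon_difference (declared in <boost/math/special_functions/relative_difference.hpp>), which measures the relative difference between two values in multiples of machine epsilon; "> 10" therefore fails a GPU result that lands more than roughly ten epsilon from the serial one. A self-contained check of that reading:

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <cassert>
    #include <limits>

    int main()
    {
        double a = 1.0;
        double b = a + 4 * std::numeric_limits<double>::epsilon();
        // a and b are about 4 epsilon apart, well inside the 10-epsilon budget.
        assert(boost::math::epsilon_difference(a, b) < 10);
        return 0;
    }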
diff --git a/test/test_airy_bi_prime_float.cu b/test/test_airy_bi_prime_float.cu
new file mode 100644
index 000000000..36874bccc
--- /dev/null
+++ b/test/test_airy_bi_prime_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_bi_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 1ad8cabd4d5e08c16b5ec9e220b3556f547d1cac Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 12:11:36 -0400 Subject: [PATCH 04/44] Add NVRTC testing of airy functions --- test/nvrtc_jamfile | 9 ++ test/test_airy_ai_nvrtc_double.cpp | 190 +++++++++++++++++++++++ test/test_airy_ai_nvrtc_float.cpp | 190 +++++++++++++++++++++++ test/test_airy_ai_prime_nvrtc_double.cpp | 190 +++++++++++++++++++++++ test/test_airy_ai_prime_nvrtc_float.cpp | 190 +++++++++++++++++++++++ test/test_airy_bi_nvrtc_double.cpp | 190 +++++++++++++++++++++++ test/test_airy_bi_nvrtc_float.cpp | 190 +++++++++++++++++++++++ test/test_airy_bi_prime_nvrtc_double.cpp | 190 +++++++++++++++++++++++ test/test_airy_bi_prime_nvrtc_float.cpp | 190 +++++++++++++++++++++++ 9 files changed, 1529 insertions(+) create mode 100644 test/test_airy_ai_nvrtc_double.cpp create mode 100644 test/test_airy_ai_nvrtc_float.cpp create mode 100644 test/test_airy_ai_prime_nvrtc_double.cpp create mode 100644 test/test_airy_ai_prime_nvrtc_float.cpp create mode 100644 test/test_airy_bi_nvrtc_double.cpp create mode 100644 test/test_airy_bi_nvrtc_float.cpp create mode 100644 test/test_airy_bi_prime_nvrtc_double.cpp create mode 100644 test/test_airy_bi_prime_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 46dd0e257..bab6596fa 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -241,6 +241,15 @@ run test_weibull_quan_nvrtc_double.cpp ; run test_weibull_quan_nvrtc_float.cpp ; # Special Functions +run test_airy_ai_nvrtc_double.cpp ; +run test_airy_ai_nvrtc_float.cpp ; +run test_airy_ai_prime_nvrtc_double.cpp ; +run test_airy_ai_prime_nvrtc_float.cpp ; +run test_airy_bi_nvrtc_double.cpp ; +run test_airy_bi_nvrtc_float.cpp ; +run test_airy_bi_prime_nvrtc_double.cpp ; +run test_airy_bi_prime_nvrtc_float.cpp ; + run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; run test_betac_nvrtc_double.cpp ; diff --git a/test/test_airy_ai_nvrtc_double.cpp b/test/test_airy_ai_nvrtc_double.cpp new file mode 100644 index 000000000..1b918cfef --- /dev/null +++ b/test/test_airy_ai_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_nvrtc_float.cpp b/test/test_airy_ai_nvrtc_float.cpp new file mode 100644 index 000000000..695730642 --- /dev/null +++ b/test/test_airy_ai_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_prime_nvrtc_double.cpp b/test/test_airy_ai_prime_nvrtc_double.cpp new file mode 100644 index 000000000..101257176 --- /dev/null +++ b/test/test_airy_ai_prime_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_prime_nvrtc_float.cpp b/test/test_airy_ai_prime_nvrtc_float.cpp new file mode 100644 index 000000000..c96e04449 --- /dev/null +++ b/test/test_airy_ai_prime_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_nvrtc_double.cpp b/test/test_airy_bi_nvrtc_double.cpp new file mode 100644 index 000000000..f69e23916 --- /dev/null +++ b/test/test_airy_bi_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_nvrtc_float.cpp b/test/test_airy_bi_nvrtc_float.cpp new file mode 100644 index 000000000..c28a5f5eb --- /dev/null +++ b/test/test_airy_bi_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_prime_nvrtc_double.cpp b/test/test_airy_bi_prime_nvrtc_double.cpp new file mode 100644 index 000000000..802f63a29 --- /dev/null +++ b/test/test_airy_bi_prime_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_prime_nvrtc_float.cpp b/test/test_airy_bi_prime_nvrtc_float.cpp new file mode 100644 index 000000000..e96aa48b9 --- /dev/null +++ b/test/test_airy_bi_prime_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast<float_type>(dist(rng));
+    }
+
+    checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+    checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+    checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+    checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+    checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+    int blockSize = 256;
+    int numBlocks = (numElements + blockSize - 1) / blockSize;
+    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+    checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+    checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+    // Verify Result
+    for (int i = 0; i < numElements; ++i)
+    {
+        const auto res = boost::math::airy_bi_prime(h_in1[i]);
+
+        if (std::isfinite(res))
+        {
+            if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+            }
+        }
+    }
+
+    cudaFree(d_in1);
+    cudaFree(d_in2);
+    cudaFree(d_out);
+    delete[] h_in1;
+    delete[] h_in2;
+    delete[] h_out;
+
+    nvrtcDestroyProgram(&prog);
+    delete[] ptx;
+
+    cuCtxDestroy(context);
+
+    std::cout << "Kernel executed successfully." << std::endl;
+    return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From d44783441d2e06c7c057fed2093fb9ec6f728b29 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 13:54:03 -0400
Subject: [PATCH 05/44] Add GPU support to ellint rc

---
 include/boost/math/special_functions/ellint_rc.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_rc.hpp b/include/boost/math/special_functions/ellint_rc.hpp
index 2f9a1f8cf..ae3c6375e 100644
--- a/include/boost/math/special_functions/ellint_rc.hpp
+++ b/include/boost/math/special_functions/ellint_rc.hpp
@@ -1,4 +1,5 @@
 // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock
+// Copyright (c) 2024 Matt Borland
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -18,12 +19,11 @@
 #pragma once
 #endif
 
-#include
 #include
+#include
 #include
 #include
 #include
-#include
 
 // Carlson's degenerate elliptic integral
 // R_C(x, y) = R_F(x, y, y) = 0.5 * \int_{0}^{\infty} (t+x)^{-1/2} (t+y)^{-1} dt
@@ -32,11 +32,11 @@
 namespace boost { namespace math { namespace detail{
 
 template <typename T, typename Policy>
-T ellint_rc_imp(T x, T y, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T ellint_rc_imp(T x, T y, const Policy& pol)
 {
    BOOST_MATH_STD_USING
 
-   static const char* function = "boost::math::ellint_rc<%1%>(%1%,%1%)";
+   constexpr auto function = "boost::math::ellint_rc<%1%>(%1%,%1%)";
 
    if(x < 0)
    {
@@ -88,7 +88,7 @@ T ellint_rc_imp(T x, T y, const Policy& pol)
 } // namespace detail
 
 template <class T1, class T2, class Policy>
-inline typename tools::promote_args<T1, T2>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
    ellint_rc(T1 x, T2 y, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2>::type result_type;
@@ -100,7 +100,7 @@ inline typename tools::promote_args<T1, T2>::type
 }
 
 template <class T1, class T2>
-inline typename tools::promote_args<T1, T2>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
    ellint_rc(T1 x, T2 y)
 {
    return ellint_rc(x, y, policies::policy<>());

From 2502444b913f6dce44436ef7050b89e54eb28346 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 13:54:08 -0400
Subject: [PATCH 06/44] Add GPU support to ellint rd

---
 .../math/special_functions/ellint_rd.hpp      | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_rd.hpp b/include/boost/math/special_functions/ellint_rd.hpp
index 2a79e54ca..174a9cbea 100644
--- a/include/boost/math/special_functions/ellint_rd.hpp
+++ b/include/boost/math/special_functions/ellint_rd.hpp
@@ -1,4 +1,5 @@
 // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock.
+// Copyright (c) 2024 Matt Borland
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -16,10 +17,11 @@
 #pragma once
 #endif
 
+#include
+#include
 #include
 #include
 #include
-#include
 #include
 
 // Carlson's elliptic integral of the second kind
@@ -32,9 +34,8 @@ template <typename T, typename Policy>
 T ellint_rd_imp(T x, T y, T z, const Policy& pol)
 {
    BOOST_MATH_STD_USING
-   using std::swap;
 
-   static const char* function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)";
+   constexpr auto function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)";
 
    if(x < 0)
    {
@@ -55,9 +56,11 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol)
    //
    // Special cases from http://dlmf.nist.gov/19.20#iv
    //
-   using std::swap;
+
    if(x == z)
-      swap(x, y);
+   {
+      BOOST_MATH_GPU_SAFE_SWAP(x, y);
+   }
    if(y == z)
    {
      if(x == y)
@@ -70,14 +73,14 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol)
      }
      else
      {
-        if((std::max)(x, y) / (std::min)(x, y) > T(1.3))
+        if(BOOST_MATH_GPU_SAFE_MAX(x, y) / BOOST_MATH_GPU_SAFE_MIN(x, y) > T(1.3))
            return 3 * (ellint_rc_imp(x, y, pol) - sqrt(x) / y) / (2 * (y - x));
         // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y)
      }
   }
   if(x == y)
   {
-     if((std::max)(x, z) / (std::min)(x, z) > T(1.3))
+     if(BOOST_MATH_GPU_SAFE_MAX(x, z) / BOOST_MATH_GPU_SAFE_MIN(x, z) > T(1.3))
         return 3 * (ellint_rc_imp(z, x, pol) - 1 / sqrt(z)) / (z - x);
      // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y)
   }
@@ -128,7 +131,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol)
    T An = (x + y + 3 * z) / 5;
    T A0 = An;
    // This has an extra 1.2 fudge factor which is really only needed when x, y and z are close in magnitude:
-   T Q = pow(tools::epsilon<T>() / 4, -T(1) / 8) * (std::max)((std::max)(An - x, An - y), An - z) * 1.2f;
+   T Q = pow(tools::epsilon<T>() / 4, -T(1) / 8) * BOOST_MATH_GPU_SAFE_MAX(BOOST_MATH_GPU_SAFE_MAX(An - x, An - y), An - z) * 1.2f;
    BOOST_MATH_INSTRUMENT_VARIABLE(Q);
    T lambda, rx, ry, rz;
    unsigned k = 0;
@@ -177,7 +180,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol)
 } // namespace detail
 
 template <class T1, class T2, class T3, class Policy>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rd(T1 x, T2 y, T3 z, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -190,7 +193,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
 }
 
 template <class T1, class T2, class T3>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rd(T1 x, T2 y, T3 z)
 {
    return ellint_rd(x, y, z, policies::policy<>());

From 93c70bc4cbc4ec13323802ee5e06ddc16af94eeb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 13:54:13 -0400
Subject: [PATCH 07/44] Add GPU support to ellint rf

---
 .../math/special_functions/ellint_rf.hpp      | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_rf.hpp b/include/boost/math/special_functions/ellint_rf.hpp
index c781ac035..39eaca546 100644
--- a/include/boost/math/special_functions/ellint_rf.hpp
+++ b/include/boost/math/special_functions/ellint_rf.hpp
@@ -1,4 +1,5 @@
 // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock
+// Copyright (c) 2024 Matt Borland
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -17,8 +18,9 @@
 #pragma once
 #endif
 
-#include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -30,21 +32,20 @@ namespace boost { namespace math { namespace detail{
 
   template <typename T, typename Policy>
-  T ellint_rf_imp(T x, T y, T z, const Policy& pol)
+  BOOST_MATH_GPU_ENABLED T ellint_rf_imp(T x, T y, T z, const Policy& pol)
   {
     BOOST_MATH_STD_USING
     using namespace boost::math;
-    using std::swap;
 
     static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
 
     if(x < 0 || y < 0 || z < 0)
     {
-      return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", std::numeric_limits<T>::quiet_NaN(), pol);
+      return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", boost::math::numeric_limits<T>::quiet_NaN(), pol);
    }
    if(x + y == 0 || y + z == 0 || z + x == 0)
    {
-      return policies::raise_domain_error<T>(function, "domain error, at most one argument can be zero, only sensible result is %1%.", std::numeric_limits<T>::quiet_NaN(), pol);
+      return policies::raise_domain_error<T>(function, "domain error, at most one argument can be zero, only sensible result is %1%.", boost::math::numeric_limits<T>::quiet_NaN(), pol);
    }
    //
    // Special cases from http://dlmf.nist.gov/19.20#i
    //
@@ -80,9 +81,9 @@ namespace boost { namespace math { namespace detail{
       return ellint_rc_imp(x, y, pol);
    }
    if(x == 0)
-      swap(x, z);
+      BOOST_MATH_GPU_SAFE_SWAP(x, z);
    else if(y == 0)
-      swap(y, z);
+      BOOST_MATH_GPU_SAFE_SWAP(y, z);
    if(z == 0)
    {
       //
@@ -105,7 +106,7 @@ namespace boost { namespace math { namespace detail{
    T zn = z;
    T An = (x + y + z) / 3;
    T A0 = An;
-   T Q = pow(3 * boost::math::tools::epsilon<T>(), T(-1) / 8) * (std::max)((std::max)(fabs(An - xn), fabs(An - yn)), fabs(An - zn));
+   T Q = pow(3 * boost::math::tools::epsilon<T>(), T(-1) / 8) * BOOST_MATH_GPU_SAFE_MAX(BOOST_MATH_GPU_SAFE_MAX(fabs(An - xn), fabs(An - yn)), fabs(An - zn));
 
    T fn = 1;
@@ -143,7 +144,7 @@ namespace boost { namespace math { namespace detail{
 } // namespace detail
 
 template <class T1, class T2, class T3, class Policy>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rf(T1 x, T2 y, T3 z, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -156,7 +157,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
 }
 
 template <class T1, class T2, class T3>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rf(T1 x, T2 y, T3 z)
 {
    return ellint_rf(x, y, z, policies::policy<>());

From 2a99ab97bed70b619eccf65e90ea65972460410f Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 13:54:20 -0400
Subject: [PATCH 08/44] Add GPU support to ellint rg

---
 include/boost/math/special_functions/ellint_rg.hpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_rg.hpp b/include/boost/math/special_functions/ellint_rg.hpp
index 051c104bc..dfbef8754 100644
--- a/include/boost/math/special_functions/ellint_rg.hpp
+++ b/include/boost/math/special_functions/ellint_rg.hpp
@@ -10,8 +10,8 @@
 #pragma once
 #endif
 
-#include
 #include
+#include
 #include
 #include
 #include
@@ -21,21 +21,20 @@
 namespace boost { namespace math { namespace detail{
 
   template <typename T, typename Policy>
-  T ellint_rg_imp(T x, T y, T z, const Policy& pol)
+  BOOST_MATH_GPU_ENABLED T ellint_rg_imp(T x, T y, T z, const Policy& pol)
   {
     BOOST_MATH_STD_USING
 
-    static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
+    constexpr auto function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
 
    if(x < 0 || y < 0 || z < 0)
    {
-      return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", std::numeric_limits<T>::quiet_NaN(), pol);
+      return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", boost::math::numeric_limits<T>::quiet_NaN(), pol);
    }
    //
    // Function is symmetric in x, y and z, but we require
    // (x - z)(y - z) >= 0 to avoid cancellation error in the result
    // which implies (for example) x >= z >= y
    //
-   using std::swap;
    if(x < y)
       swap(x, y);
    if(x < z)
@@ -106,7 +105,7 @@ namespace boost { namespace math { namespace detail{
 } // namespace detail
 
 template <class T1, class T2, class T3, class Policy>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rg(T1 x, T2 y, T3 z, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -119,7 +118,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
 }
 
 template <class T1, class T2, class T3>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rg(T1 x, T2 y, T3 z)
 {
    return ellint_rg(x, y, z, policies::policy<>());

From 1b8aedf64b8c5e59bc26ae79b6c64f195ad90c31 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Thu, 12 Sep 2024 13:54:27 -0400
Subject: [PATCH 09/44] Add GPU support to ellint rj

---
 .../math/special_functions/ellint_rj.hpp      | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_rj.hpp b/include/boost/math/special_functions/ellint_rj.hpp
index f19eac284..66d604b2c 100644
--- a/include/boost/math/special_functions/ellint_rj.hpp
+++ b/include/boost/math/special_functions/ellint_rj.hpp
@@ -1,4 +1,5 @@
 // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock
+// Copyright (c) 2024 Matt Borland
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -18,8 +19,9 @@
 #pragma once
 #endif
 
-#include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -32,7 +34,7 @@
 namespace boost { namespace math { namespace detail{
 
 template <typename T, typename Policy>
-T ellint_rc1p_imp(T y, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T ellint_rc1p_imp(T y, const Policy& pol)
 {
    using namespace boost::math;
    // Calculate RC(1, 1 + x)
@@ -70,11 +72,11 @@ T ellint_rc1p_imp(T y, const Policy& pol)
 }
 
 template <typename T, typename Policy>
-T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
 {
    BOOST_MATH_STD_USING
 
-   static const char* function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)";
+   constexpr auto function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)";
 
    if(x < 0)
    {
@@ -94,7 +96,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
    }
    if(x + y == 0 || y + z == 0 || z + x == 0)
    {
-      return policies::raise_domain_error<T>(function, "At most one argument can be zero, only possible result is %1%.", std::numeric_limits<T>::quiet_NaN(), pol);
+      return policies::raise_domain_error<T>(function, "At most one argument can be zero, only possible result is %1%.", boost::math::numeric_limits<T>::quiet_NaN(), pol);
    }
 
    // for p < 0, the integral is singular, return Cauchy principal value
@@ -106,11 +108,11 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
    // we can just permute the values:
    //
    if(x > y)
-      std::swap(x, y);
+      BOOST_MATH_GPU_SAFE_SWAP(x, y);
    if(y > z)
-      std::swap(y, z);
+      BOOST_MATH_GPU_SAFE_SWAP(y, z);
    if(x > y)
-      std::swap(x, y);
+      BOOST_MATH_GPU_SAFE_SWAP(x, y);
 
    BOOST_MATH_ASSERT(x <= y);
    BOOST_MATH_ASSERT(y <= z);
@@ -148,13 +150,12 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
    else
    {
      // x = y only, permute so y = z:
-     using std::swap;
-     swap(x, z);
+     BOOST_MATH_GPU_SAFE_SWAP(x, z);
      if(y == p)
      {
         return ellint_rd_imp(x, y, y, pol);
      }
-     else if((std::max)(y, p) / (std::min)(y, p) > T(1.2))
+     else if(BOOST_MATH_GPU_SAFE_MAX(y, p) / BOOST_MATH_GPU_SAFE_MIN(y, p) > T(1.2))
      {
         return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y);
      }
@@ -168,7 +169,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
         // y = z = p:
         return ellint_rd_imp(x, y, y, pol);
      }
-     else if((std::max)(y, p) / (std::min)(y, p) > T(1.2))
+     else if(BOOST_MATH_GPU_SAFE_MAX(y, p) / BOOST_MATH_GPU_SAFE_MIN(y, p) > T(1.2))
      {
         // y = z:
         return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y);
@@ -187,7 +188,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
    T An = (x + y + z + 2 * p) / 5;
    T A0 = An;
    T delta = (p - x) * (p - y) * (p - z);
-   T Q = pow(tools::epsilon<T>() / 5, -T(1) / 8) * (std::max)((std::max)(fabs(An - x), fabs(An - y)), (std::max)(fabs(An - z), fabs(An - p)));
+   T Q = pow(tools::epsilon<T>() / 5, -T(1) / 8) * BOOST_MATH_GPU_SAFE_MAX(BOOST_MATH_GPU_SAFE_MAX(fabs(An - x), fabs(An - y)), BOOST_MATH_GPU_SAFE_MAX(fabs(An - z), fabs(An - p)));
 
    unsigned n;
    T lambda;
@@ -263,7 +264,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol)
 } // namespace detail
 
 template <class T1, class T2, class T3, class T4, class Policy>
-inline typename tools::promote_args<T1, T2, T3, T4>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
    ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2, T3, T4>::type result_type;
@@ -278,7 +279,7 @@ inline typename tools::promote_args<T1, T2, T3, T4>::type
 }
 
 template <class T1, class T2, class T3, class T4>
-inline typename tools::promote_args<T1, T2, T3, T4>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
    ellint_rj(T1 x, T2 y, T3 z, T4 p)
 {
    return
ellint_rj(x, y, z, p, policies::policy<>()); From 456644c3274369463243c4be422724e4453a3b07 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 13:54:41 -0400 Subject: [PATCH 10/44] Add GPU support to ellint d --- .../boost/math/special_functions/ellint_d.hpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/boost/math/special_functions/ellint_d.hpp b/include/boost/math/special_functions/ellint_d.hpp index da1e87ba3..f5a8491f5 100644 --- a/include/boost/math/special_functions/ellint_d.hpp +++ b/include/boost/math/special_functions/ellint_d.hpp @@ -1,5 +1,6 @@ // Copyright (c) 2006 Xiaogang Zhang // Copyright (c) 2006 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -18,6 +19,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -33,16 +36,16 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); namespace detail{ template -T ellint_d_imp(T k, const Policy& pol); +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol); // Elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -113,7 +116,7 @@ T ellint_d_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -135,7 +138,7 @@ T ellint_d_imp(T k, const Policy& pol) } template -inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -144,7 +147,7 @@ inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, co // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_d(k, phi, policies::policy<>()); } @@ -153,21 +156,21 @@ inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const s // Complete elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T k) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T k) { return ellint_d(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_d(k, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) 
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; From e77182726a0479108983b1f516c75d38b97d6cda Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 14:11:03 -0400 Subject: [PATCH 11/44] Add GPU support to ellint_1 --- .../boost/math/special_functions/ellint_1.hpp | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/include/boost/math/special_functions/ellint_1.hpp b/include/boost/math/special_functions/ellint_1.hpp index f7fbbce40..169dfc934 100644 --- a/include/boost/math/special_functions/ellint_1.hpp +++ b/include/boost/math/special_functions/ellint_1.hpp @@ -20,6 +20,7 @@ #endif #include +#include #include #include #include @@ -33,28 +34,28 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); namespace detail{ template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, T one_minus_k2); // Elliptic integral (Legendre form) of the first kind template -T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) +BOOST_MATH_GPU_ENABLED T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - static const char* function = "boost::math::ellint_f<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::ellint_f<%1%>(%1%,%1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(phi); BOOST_MATH_INSTRUMENT_VARIABLE(k); BOOST_MATH_INSTRUMENT_VARIABLE(function); @@ -151,19 +152,19 @@ T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) } template -inline T ellint_f_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T ellint_f_imp(T phi, T k, const Policy& pol) { return ellint_f_imp(phi, k, pol, T(1 - k * k)); } // Complete elliptic integral (Legendre form) of the first kind template -T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) +BOOST_MATH_GPU_ENABLED T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) { BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_k<%1%>(%1%)"; + constexpr auto function = "boost::math::ellint_k<%1%>(%1%)"; if (abs(k) > 1) { @@ -181,7 +182,7 @@ T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) return value; } template -inline T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED inline T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { return ellint_k_imp(k, pol, T(1 - k * k)); } @@ -203,9 +204,9 @@ inline T ellint_k_imp(T k, const Policy& pol, std::integral_constant con // archived in the code below), but was 
found to have slightly higher error rates. // template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -456,7 +457,7 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons // This handles all cases where m > 0.9, // including all error handling: // - return ellint_k_imp(k, pol, std::integral_constant()); + return ellint_k_imp(k, pol, boost::math::integral_constant()); #if 0 else { @@ -476,9 +477,9 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons } } template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -757,28 +758,28 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_k_imp(k, pol, std::integral_constant()); + return ellint_k_imp(k, pol, boost::math::integral_constant()); } } template -typename tools::promote_args::type ellint_1(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; - typedef std::integral_constant::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 #endif > precision_tag_type; return policies::checked_narrowing_cast(detail::ellint_k_imp(static_cast(k), pol, precision_tag_type()), "boost::math::ellint_1<%1%>(%1%)"); } template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_1(k, phi, policies::policy<>()); } @@ -787,14 +788,14 @@ typename tools::promote_args::type ellint_1(T1 k, T2 phi, const std::fal // Complete elliptic integral (Legendre form) of the first kind template -typename tools::promote_args::type ellint_1(T k) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k) { return ellint_1(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the first kind template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! 
{ typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -802,7 +803,7 @@ typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& } template -typename tools::promote_args::type ellint_1(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_1(k, phi, tag_type()); From e1fd1538e047941134a3996563a87990a45a065d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 14:16:17 -0400 Subject: [PATCH 12/44] Markup forward and add ellint_3 return type def for NVRTC platform --- .../boost/math/special_functions/math_fwd.hpp | 87 +++++++++++-------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 2f4ccd5e7..7bba2f93c 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -26,6 +26,7 @@ #include #include // for argument promotion. +#include #ifdef BOOST_MATH_HAS_NVRTC @@ -36,6 +37,20 @@ template BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type beta(RT1 a, RT2 b, A arg); +namespace detail{ + + template + struct ellint_3_result + { + using type = typename boost::math::conditional< + policies::is_policy::value, + tools::promote_args_t, + tools::promote_args_t + >::type; + }; + + } // namespace detail + } // namespace math } // namespace boost @@ -329,71 +344,71 @@ namespace boost // Elliptic integrals: template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rf(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rf(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rd(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rd(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rc(T1 x, T2 y); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rc(T1 x, T2 y, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rj(T1 x, T2 y, T3 z, T4 p); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rg(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rg(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t ellint_2(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T k); template - tools::promote_args_t ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T1 k, T2 phi); template - tools::promote_args_t ellint_2(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t ellint_1(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_1(T k); template - tools::promote_args_t ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_1(T1 k, T2 phi); template - tools::promote_args_t ellint_1(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED 
tools::promote_args_t ellint_1(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t ellint_d(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T k); template - tools::promote_args_t ellint_d(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T1 k, T2 phi); template - tools::promote_args_t ellint_d(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T1 k, T2 phi, const Policy& pol); template tools::promote_args_t jacobi_zeta(T1 k, T2 phi); @@ -412,7 +427,7 @@ namespace boost template struct ellint_3_result { - using type = typename std::conditional< + using type = typename boost::math::conditional< policies::is_policy::value, tools::promote_args_t, tools::promote_args_t @@ -423,13 +438,13 @@ namespace boost template - typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); + BOOST_MATH_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); template - tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); template - tools::promote_args_t ellint_3(T1 k, T2 v); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_3(T1 k, T2 v); // Factorial functions. // Note: not for integral types, at present. @@ -1365,36 +1380,36 @@ namespace boost spherical_harmonic_i(unsigned n, int m, T1 theta, T2 phi, const Policy& pol);\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rf(T1 x, T2 y, T3 z){ return ::boost::math::ellint_rf(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rd(T1 x, T2 y, T3 z){ return ::boost::math::ellint_rd(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rc(T1 x, T2 y){ return ::boost::math::ellint_rc(x, y, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rj(T1 x, T2 y, T3 z, T4 p){ return boost::math::ellint_rj(x, y, z, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rg(T1 x, T2 y, T3 z){ return ::boost::math::ellint_rg(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_2(T k){ return boost::math::ellint_2(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_2(T k){ return boost::math::ellint_2(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_2(T1 k, T2 phi){ return boost::math::ellint_2(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_2(T1 k, T2 phi){ return boost::math::ellint_2(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_d(T k){ return boost::math::ellint_d(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_d(T k){ return boost::math::ellint_d(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_d(T1 k, T2 phi){ return boost::math::ellint_d(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_d(T1 k, T2 phi){ return boost::math::ellint_d(k, phi, 
Policy()); }\ \ template \ inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ @@ -1403,16 +1418,16 @@ namespace boost inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_1(T k){ return boost::math::ellint_1(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_1(T k){ return boost::math::ellint_1(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_1(T1 k, T2 phi){ return boost::math::ellint_1(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_1(T1 k, T2 phi){ return boost::math::ellint_1(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi){ return boost::math::ellint_3(k, v, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi){ return boost::math::ellint_3(k, v, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v){ return boost::math::ellint_3(k, v, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v){ return boost::math::ellint_3(k, v, Policy()); }\ \ using boost::math::max_factorial;\ template \ From e5b3f70d54c33cd5ac319f5a342d56fccb6cf977 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 14:32:42 -0400 Subject: [PATCH 13/44] Add CUDA testing of ellint 1 --- .../boost/math/special_functions/ellint_1.hpp | 8 +- test/cuda_jamfile | 3 + test/test_ellint_1_double.cu | 100 ++++++++++++++++++ test/test_ellint_1_float.cu | 100 ++++++++++++++++++ 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 test/test_ellint_1_double.cu create mode 100644 test/test_ellint_1_float.cu diff --git a/include/boost/math/special_functions/ellint_1.hpp b/include/boost/math/special_functions/ellint_1.hpp index 169dfc934..36e492784 100644 --- a/include/boost/math/special_functions/ellint_1.hpp +++ b/include/boost/math/special_functions/ellint_1.hpp @@ -39,13 +39,13 @@ BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, namespace detail{ template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, T one_minus_k2); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, T one_minus_k2); // Elliptic integral (Legendre form) of the first kind template diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 0e4ead125..86cbb5ea6 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -318,6 +318,9 @@ run test_cos_pi_float.cu ; run test_digamma_double.cu ; run test_digamma_float.cu ; +run 
test_ellint_1_double.cu ;
+run test_ellint_1_float.cu ;
+
 run test_erf_double.cu ;
 run test_erf_float.cu ;
 run test_erf_inv_double.cu ;
diff --git a/test/test_ellint_1_double.cu b/test/test_ellint_1_double.cu
new file mode 100644
index 000000000..8de959d22
--- /dev/null
+++ b/test/test_ellint_1_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ellint_1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_1_float.cu b/test/test_ellint_1_float.cu
new file mode 100644
index 000000000..8de959d22
--- /dev/null
+++ b/test/test_ellint_1_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ellint_1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From d9144de1c0784215a4be060a2da48708e9499ee5 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 14:55:32 -0400 Subject: [PATCH 14/44] NVRTC fixes --- .../boost/math/special_functions/ellint_1.hpp | 16 ++++++++-------- .../boost/math/special_functions/math_fwd.hpp | 1 + include/boost/math/tools/workaround.hpp | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/boost/math/special_functions/ellint_1.hpp b/include/boost/math/special_functions/ellint_1.hpp index 36e492784..96c7c9e9b 100644 --- a/include/boost/math/special_functions/ellint_1.hpp +++ b/include/boost/math/special_functions/ellint_1.hpp @@ -784,14 +784,7 @@ BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, return boost::math::ellint_1(k, phi, policies::policy<>()); } -} - -// Complete elliptic integral (Legendre form) of the first kind -template -BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k) -{ - return ellint_1(k, policies::policy<>()); -} +} // namespace detail // Elliptic integral (Legendre form) of the first kind template @@ -809,6 +802,13 @@ BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, return detail::ellint_1(k, phi, tag_type()); } +// Complete elliptic integral (Legendre form) of the first kind +template +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k) +{ + return ellint_1(k, policies::policy<>()); +} + }} // namespaces #endif // BOOST_MATH_ELLINT_1_HPP diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 7bba2f93c..b010f5812 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -27,6 +27,7 @@ #include #include // for argument promotion. 
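// The single include added to math_fwd.hpp below appears to supply the
// boost::math::conditional alias used by detail::ellint_3_result (an
// assumption: likely boost/math/tools/type_traits.hpp); under NVRTC the
// standard <type_traits> header and std::conditional are not available,
// so Boost.Math substitutes its own GPU-safe type traits.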
#include +#include #ifdef BOOST_MATH_HAS_NVRTC diff --git a/include/boost/math/tools/workaround.hpp b/include/boost/math/tools/workaround.hpp index 9b15c4e93..7edd1c12a 100644 --- a/include/boost/math/tools/workaround.hpp +++ b/include/boost/math/tools/workaround.hpp @@ -23,7 +23,7 @@ namespace boost{ namespace math{ namespace tools{ // std::fmod(1185.0L, 1.5L); // template -inline T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED inline T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) { BOOST_MATH_STD_USING return fmod(a, b); From 012074e85517193ce13dfeb8a3e3ae8f27069a99 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 14:55:41 -0400 Subject: [PATCH 15/44] Add NVRTC testing of ellint_1 --- test/nvrtc_jamfile | 3 + test/test_ellint_1_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++ test/test_ellint_1_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 test/test_ellint_1_nvrtc_double.cpp create mode 100644 test/test_ellint_1_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index bab6596fa..5dc163e28 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -322,6 +322,9 @@ run test_erf_inv_nvrtc_float.cpp ; run test_erfc_inv_nvrtc_double.cpp ; run test_erfc_inv_nvrtc_float.cpp ; +run test_ellint_1_nvrtc_double.cpp ; +run test_ellint_1_nvrtc_float.cpp ; + run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_ellint_1_nvrtc_double.cpp b/test/test_ellint_1_nvrtc_double.cpp new file mode 100644 index 000000000..fac5da55f --- /dev/null +++ b/test/test_ellint_1_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_1_nvrtc_float.cpp b/test/test_ellint_1_nvrtc_float.cpp new file mode 100644 index 000000000..fac5da55f --- /dev/null +++ b/test/test_ellint_1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ellint_1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_1(in1[i]);
+    }
+}
+)";

void checkCUDAError(cudaError_t result, const char* msg)
{
    if (result != cudaSuccess)
    {
        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkCUError(CUresult result, const char* msg)
{
    if (result != CUDA_SUCCESS)
    {
        const char* errorStr;
        cuGetErrorString(result, &errorStr);
        std::cerr << msg << ": " << errorStr << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkNVRTCError(nvrtcResult result, const char* msg)
{
    if (result != NVRTC_SUCCESS)
    {
        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

int main()
{
    try
    {
        // Initialize CUDA driver API
        checkCUError(cuInit(0), "Failed to initialize CUDA");

        // Create CUDA context
        CUcontext context;
        CUdevice device;
        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");

        nvrtcProgram prog;
        nvrtcResult res;

        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_1_kernel.cu", 0, nullptr, nullptr);
        checkNVRTCError(res, "Failed to create NVRTC program");

        nvrtcAddNameExpression(prog, "test_ellint_1_kernel");

        #ifdef BOOST_MATH_NVRTC_CI_RUN
        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
        #else
        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
        #endif

        // Compile the program
        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
        if (res != NVRTC_SUCCESS)
        {
            size_t log_size;
            nvrtcGetProgramLogSize(prog, &log_size);
            char* log = new char[log_size];
            nvrtcGetProgramLog(prog, log);
            std::cerr << "Compilation failed:\n" << log << std::endl;
            delete[] log;
            exit(EXIT_FAILURE);
        }

        // Get PTX from the program
        size_t ptx_size;
        nvrtcGetPTXSize(prog, &ptx_size);
        char* ptx = new char[ptx_size];
        nvrtcGetPTX(prog, ptx);

        // Load PTX into CUDA module
        CUmodule module;
        CUfunction kernel;
        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
        checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_1_kernel"), "Failed to get kernel function");

        int numElements = 5000;
        float_type *h_in1, *h_in2, *h_out;
        float_type *d_in1, *d_in2, *d_out;

        // Allocate memory on the host
        h_in1 = new float_type[numElements];
        h_in2 = new float_type[numElements];
        h_out = new float_type[numElements];

        // Initialize input arrays
        std::mt19937_64 rng(42);
        std::uniform_real_distribution dist(0.0f, 1.0f);
        for (int i = 0; i < numElements; ++i)
        {
            h_in1[i] = static_cast(dist(rng));
            h_in2[i] =
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 153831e203bab67d3adeccb6c4c30d2a312e9d2f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 15:30:04 -0400 Subject: [PATCH 16/44] Add GPU support to ellint_2 --- .../boost/math/special_functions/ellint_2.hpp | 55 ++++++++++--------- .../math/special_functions/ellint_rd.hpp | 6 +- .../math/special_functions/ellint_rg.hpp | 10 ++-- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/include/boost/math/special_functions/ellint_2.hpp b/include/boost/math/special_functions/ellint_2.hpp index 5e2552cec..dc8e86e8e 100644 --- a/include/boost/math/special_functions/ellint_2.hpp +++ b/include/boost/math/special_functions/ellint_2.hpp @@ -19,6 +19,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -34,20 +37,20 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); namespace detail{ template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const boost::math::integral_constant&); template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const boost::math::integral_constant&); template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const 
Policy& pol, const boost::math::integral_constant&); // Elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_e_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -72,9 +75,9 @@ T ellint_e_imp(T phi, T k, const Policy& pol) } else if(phi > 1 / tools::epsilon()) { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 > precision_tag_type; // Phi is so large that phi%pi is necessarily zero (or garbage), // just return the second part of the duplication formula: @@ -139,9 +142,9 @@ T ellint_e_imp(T phi, T k, const Policy& pol) } if (m != 0) { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 > precision_tag_type; result += m * ellint_e_imp(k, pol, precision_tag_type()); } @@ -151,7 +154,7 @@ T ellint_e_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -189,9 +192,9 @@ T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) // existing routines. 
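// A note on the integral_constant tags selecting ellint_e_imp above: tag 0
// appears to be the polynomial approximation for types with at most 54
// significand bits (double and narrower), tag 1 the variant for at most 64
// bits (80-bit long double), and tag 2 the fully generic implementation;
// per the comments further down, the tag 0/1 routines also fall back to the
// generic path for m > 0.9 and for all error handling.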
// template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -424,13 +427,13 @@ BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_e_imp(k, pol, std::integral_constant()); + return ellint_e_imp(k, pol, boost::math::integral_constant()); } } template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -697,25 +700,25 @@ BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_e_imp(k, pol, std::integral_constant()); + return ellint_e_imp(k, pol, boost::math::integral_constant()); } } template -typename tools::promote_args::type ellint_2(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 > precision_tag_type; return policies::checked_narrowing_cast(detail::ellint_e_imp(static_cast(k), pol, precision_tag_type()), "boost::math::ellint_2<%1%>(%1%)"); } // Elliptic integral (Legendre form) of the second kind template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_2(k, phi, policies::policy<>()); } @@ -724,21 +727,21 @@ typename tools::promote_args::type ellint_2(T1 k, T2 phi, const std::fal // Complete elliptic integral (Legendre form) of the second kind template -typename tools::promote_args::type ellint_2(T k) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k) { return ellint_2(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the second kind template -typename tools::promote_args::type ellint_2(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_2(k, phi, tag_type()); } template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! 
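// A minimal usage sketch for the public ellint_2 API (illustrative only,
// not part of this patch):
//
//   double E_complete   = boost::math::ellint_2(0.5);        // complete integral E(k)
//   double E_incomplete = boost::math::ellint_2(0.5, 0.25);  // incomplete E(phi, k), k = 0.5, phi = 0.25
//
// Once marked BOOST_MATH_GPU_ENABLED, the same calls are usable from device
// code as well as from the host.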
{ typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; diff --git a/include/boost/math/special_functions/ellint_rd.hpp b/include/boost/math/special_functions/ellint_rd.hpp index 174a9cbea..395be4e70 100644 --- a/include/boost/math/special_functions/ellint_rd.hpp +++ b/include/boost/math/special_functions/ellint_rd.hpp @@ -31,7 +31,7 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rd_imp(T x, T y, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rd_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING @@ -85,7 +85,9 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y) } if(y == 0) - swap(x, y); + { + BOOST_MATH_GPU_SAFE_SWAP(x, y); + } if(x == 0) { // diff --git a/include/boost/math/special_functions/ellint_rg.hpp b/include/boost/math/special_functions/ellint_rg.hpp index dfbef8754..8a7f706ac 100644 --- a/include/boost/math/special_functions/ellint_rg.hpp +++ b/include/boost/math/special_functions/ellint_rg.hpp @@ -36,11 +36,11 @@ namespace boost { namespace math { namespace detail{ // which implies (for example) x >= z >= y // if(x < y) - swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); if(x < z) - swap(x, z); + BOOST_MATH_GPU_SAFE_SWAP(x, z); if(y > z) - swap(y, z); + BOOST_MATH_GPU_SAFE_SWAP(y, z); BOOST_MATH_ASSERT(x >= z); BOOST_MATH_ASSERT(z >= y); @@ -63,7 +63,7 @@ namespace boost { namespace math { namespace detail{ else { // x = z, y != 0 - swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); return (x == 0) ? T(sqrt(z) / 2) : T((z * ellint_rc_imp(x, z, pol) + sqrt(x)) / 2); } } @@ -74,7 +74,7 @@ namespace boost { namespace math { namespace detail{ } else if(y == 0) { - swap(y, z); + BOOST_MATH_GPU_SAFE_SWAP(y, z); // // Special handling for common case, from // Numerical Computation of Real or Complex Elliptic Integrals, eq.46 From d74ba4a3a2050bedf2c003f9486e59fdc256b975 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 15:38:51 -0400 Subject: [PATCH 17/44] Add CUDA testing of ellint_2 --- test/cuda_jamfile | 2 + test/test_ellint_1_double.cu | 16 ++++-- test/test_ellint_2_double.cu | 100 +++++++++++++++++++++++++++++++++++ test/test_ellint_2_float.cu | 100 +++++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 5 deletions(-) create mode 100644 test/test_ellint_2_double.cu create mode 100644 test/test_ellint_2_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 86cbb5ea6..2eb4ab09a 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -320,6 +320,8 @@ run test_digamma_float.cu ; run test_ellint_1_double.cu ; run test_ellint_1_float.cu ; +run test_ellint_2_double.cu ; +run test_ellint_2_float.cu ; run test_erf_double.cu ; run test_erf_float.cu ; diff --git a/test/test_ellint_1_double.cu b/test/test_ellint_1_double.cu index 8de959d22..eb9bfb162 100644 --- a/test/test_ellint_1_double.cu +++ b/test/test_ellint_1_double.cu @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "cuda_managed_ptr.hpp" #include "stopwatch.hpp" @@ -15,7 +16,7 @@ // For the CUDA runtime routines (prefixed with "cuda_") #include -typedef float float_type; +typedef double float_type; /** * CUDA Kernel Device code @@ -51,9 +52,11 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); for (int i = 0; i < numElements; ++i) { - input_vector[i] = 
rand()/(float_type)RAND_MAX;
+        input_vector[i] = dist(rng);
     }
 
     // Launch the Vector Add CUDA Kernel
@@ -86,10 +89,13 @@ int main(void)
     // check the results
     for(int i = 0; i < numElements; ++i)
     {
-        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        if (std::isfinite(results[i]))
         {
-            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-            return EXIT_FAILURE;
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
         }
     }
diff --git a/test/test_ellint_2_double.cu b/test/test_ellint_2_double.cu
new file mode 100644
index 000000000..2e1073576
--- /dev/null
+++ b/test/test_ellint_2_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_2(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ellint_2(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_2_float.cu b/test/test_ellint_2_float.cu
new file mode 100644
index 000000000..a55a6d1ad
--- /dev/null
+++ b/test/test_ellint_2_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_2(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ellint_2(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 118df512fbd460eb21e5574e73f29bc35f7cda52 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 15:44:09 -0400 Subject: [PATCH 18/44] Fix NVRTC errors --- .../boost/math/special_functions/ellint_2.hpp | 16 +++++++++------- .../boost/math/special_functions/ellint_rd.hpp | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/boost/math/special_functions/ellint_2.hpp b/include/boost/math/special_functions/ellint_2.hpp index dc8e86e8e..0cc1fa094 100644 --- a/include/boost/math/special_functions/ellint_2.hpp +++ b/include/boost/math/special_functions/ellint_2.hpp @@ -725,13 +725,6 @@ BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, } // detail -// Complete elliptic integral (Legendre form) of the second kind -template -BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k) -{ - return ellint_2(k, policies::policy<>()); -} - // Elliptic integral (Legendre form) of the second kind template BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi) @@ -748,6 +741,15 @@ BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, return policies::checked_narrowing_cast(detail::ellint_e_imp(static_cast(phi), static_cast(k), pol), "boost::math::ellint_2<%1%>(%1%,%1%)"); } + +// Complete elliptic integral (Legendre form) of the second kind +template +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k) +{ + return ellint_2(k, policies::policy<>()); +} + + }} // namespaces #endif // BOOST_MATH_ELLINT_2_HPP diff --git a/include/boost/math/special_functions/ellint_rd.hpp b/include/boost/math/special_functions/ellint_rd.hpp index 395be4e70..f2a33adc4 100644 --- a/include/boost/math/special_functions/ellint_rd.hpp +++ b/include/boost/math/special_functions/ellint_rd.hpp @@ -21,7 +21,6 @@ #include #include #include -#include #include // Carlson's elliptic integral of the second kind @@ -107,7 +106,8 @@ BOOST_MATH_GPU_ENABLED T ellint_rd_imp(T x, T y, T z, const Policy& pol) xn = (xn + yn) / 2; yn = t; sum_pow *= 2; - sum += sum_pow * boost::math::pow<2>(xn - yn); + const auto temp = (xn - yn); + sum += sum_pow * temp * temp; } T RF = constants::pi() / (xn + yn); // From 660974e1a8ccc54e9156b91e851037eae6f214ad Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 15:44:40 -0400 Subject: [PATCH 19/44] Add NVRTC testing of ellint_2 --- test/nvrtc_jamfile | 2 + test/test_ellint_2_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++ test/test_ellint_2_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_ellint_2_nvrtc_double.cpp create mode 100644 test/test_ellint_2_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 5dc163e28..1d3443bba 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -324,6 +324,8 @@ run test_erfc_inv_nvrtc_float.cpp ; run test_ellint_1_nvrtc_double.cpp ; run test_ellint_1_nvrtc_float.cpp ; +run test_ellint_2_nvrtc_double.cpp ; +run test_ellint_2_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_ellint_2_nvrtc_double.cpp b/test/test_ellint_2_nvrtc_double.cpp new file mode 100644 index 000000000..dd2eef154 --- /dev/null +++ b/test/test_ellint_2_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. 
+// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_2_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_2(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_2_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_2_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_2_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + 
std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_2(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_2_nvrtc_float.cpp b/test/test_ellint_2_nvrtc_float.cpp new file mode 100644 index 000000000..dd2eef154 --- /dev/null +++ b/test/test_ellint_2_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ellint_2_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_2(in1[i]);
+    }
+}
+)";

void checkCUDAError(cudaError_t result, const char* msg)
{
    if (result != cudaSuccess)
    {
        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkCUError(CUresult result, const char* msg)
{
    if (result != CUDA_SUCCESS)
    {
        const char* errorStr;
        cuGetErrorString(result, &errorStr);
        std::cerr << msg << ": " << errorStr << std::endl;
        exit(EXIT_FAILURE);
    }
}

void checkNVRTCError(nvrtcResult result, const char* msg)
{
    if (result != NVRTC_SUCCESS)
    {
        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

int main()
{
    try
    {
        // Initialize CUDA driver API
        checkCUError(cuInit(0), "Failed to initialize CUDA");

        // Create CUDA context
        CUcontext context;
        CUdevice device;
        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");

        nvrtcProgram prog;
        nvrtcResult res;

        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_2_kernel.cu", 0, nullptr, nullptr);
        checkNVRTCError(res, "Failed to create NVRTC program");

        nvrtcAddNameExpression(prog, "test_ellint_2_kernel");

        #ifdef BOOST_MATH_NVRTC_CI_RUN
        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
        #else
        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
        #endif

        // Compile the program
        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
        if (res != NVRTC_SUCCESS)
        {
            size_t log_size;
            nvrtcGetProgramLogSize(prog, &log_size);
            char* log = new char[log_size];
            nvrtcGetProgramLog(prog, log);
            std::cerr << "Compilation failed:\n" << log << std::endl;
            delete[] log;
            exit(EXIT_FAILURE);
        }

        // Get PTX from the program
        size_t ptx_size;
        nvrtcGetPTXSize(prog, &ptx_size);
        char* ptx = new char[ptx_size];
        nvrtcGetPTX(prog, ptx);

        // Load PTX into CUDA module
        CUmodule module;
        CUfunction kernel;
        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
        checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_2_kernel"), "Failed to get kernel function");

        int numElements = 5000;
        float_type *h_in1, *h_in2, *h_out;
        float_type *d_in1, *d_in2, *d_out;

        // Allocate memory on the host
        h_in1 = new float_type[numElements];
        h_in2 = new float_type[numElements];
        h_out = new float_type[numElements];

        // Initialize input arrays
        std::mt19937_64 rng(42);
        std::uniform_real_distribution dist(0.0f, 1.0f);
        for (int i = 0; i < numElements; ++i)
        {
            h_in1[i] = static_cast(dist(rng));
            h_in2[i] =
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_2(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 692c5aa6cd55e15f0273a6162a03ba84274cdb73 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 16:03:51 -0400 Subject: [PATCH 20/44] Add GPU support to atanh --- include/boost/math/special_functions/atanh.hpp | 10 +++++----- include/boost/math/special_functions/math_fwd.hpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/boost/math/special_functions/atanh.hpp b/include/boost/math/special_functions/atanh.hpp index 543fb5fce..9d73e568c 100644 --- a/include/boost/math/special_functions/atanh.hpp +++ b/include/boost/math/special_functions/atanh.hpp @@ -15,7 +15,7 @@ #pragma once #endif -#include +#include #include #include #include @@ -33,10 +33,10 @@ namespace boost // This is the main fare template - inline T atanh_imp(const T x, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline T atanh_imp(const T x, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::atanh<%1%>(%1%)"; + constexpr auto function = "boost::math::atanh<%1%>(%1%)"; if(x < -1) { @@ -87,7 +87,7 @@ namespace boost } template - inline typename tools::promote_args::type atanh(T x, const Policy&) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type atanh(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -102,7 +102,7 @@ namespace boost "boost::math::atanh<%1%>(%1%)"); } template - inline typename tools::promote_args::type atanh(T x) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type atanh(T x) { return boost::math::atanh(x, policies::policy<>()); } diff --git 
a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index b010f5812..44600422d 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -664,10 +664,10 @@ namespace boost tools::promote_args_t acosh(T x, const Policy&); template - tools::promote_args_t atanh(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t atanh(T x); template - tools::promote_args_t atanh(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t atanh(T x, const Policy&); namespace detail{ @@ -1531,7 +1531,7 @@ namespace boost inline boost::math::tools::promote_args_t acosh(const T x){ return boost::math::acosh(x, Policy()); }\ \ template\ - inline boost::math::tools::promote_args_t atanh(const T x){ return boost::math::atanh(x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t atanh(const T x){ return boost::math::atanh(x, Policy()); }\ \ template \ inline typename boost::math::detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x)\ From 1d6831846b9fe33bdc2cf3cec0f44fc90e9ea433 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 16:08:58 -0400 Subject: [PATCH 21/44] Add GPU support to ellint_3 --- .../boost/math/special_functions/ellint_3.hpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/boost/math/special_functions/ellint_3.hpp b/include/boost/math/special_functions/ellint_3.hpp index 33acc545d..5ca11c193 100644 --- a/include/boost/math/special_functions/ellint_3.hpp +++ b/include/boost/math/special_functions/ellint_3.hpp @@ -18,6 +18,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -38,16 +40,16 @@ namespace boost { namespace math { namespace detail{ template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol); +BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol); // Elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) { // Note vc = 1-v presumably without cancellation error. 
BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; + constexpr auto function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; T sphi = sin(fabs(phi)); @@ -270,13 +272,13 @@ T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) // Complete elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) { // Note arg vc = 1-v, possibly without cancellation errors BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; if (abs(k) >= 1) { @@ -318,13 +320,13 @@ T ellint_pi_imp(T v, T k, T vc, const Policy& pol) } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const boost::math::false_type&) { return boost::math::ellint_3(k, v, phi, policies::policy<>()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -339,7 +341,7 @@ inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Pol } // namespace detail template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -354,14 +356,14 @@ inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 ph } template -typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) +BOOST_MATH_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_3(k, v, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v) { return ellint_3(k, v, policies::policy<>()); } From a704f03e295ff67fa375dd918d789fd9d032dc6f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 16:09:07 -0400 Subject: [PATCH 22/44] Add NVRTC testing of ellint_3 --- test/nvrtc_jamfile | 2 + test/test_ellint_3_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++ test/test_ellint_3_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_ellint_3_nvrtc_double.cpp create mode 100644 test/test_ellint_3_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1d3443bba..fb945f1e3 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -326,6 +326,8 @@ run test_ellint_1_nvrtc_double.cpp ; run test_ellint_1_nvrtc_float.cpp ; run test_ellint_2_nvrtc_double.cpp ; run test_ellint_2_nvrtc_float.cpp ; +run test_ellint_3_nvrtc_double.cpp ; +run test_ellint_3_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_ellint_3_nvrtc_double.cpp b/test/test_ellint_3_nvrtc_double.cpp new file mode 100644 index 
000000000..dacab6619 --- /dev/null +++ b/test/test_ellint_3_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_3_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_3_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_3_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_3_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new 
float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_3(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_3_nvrtc_float.cpp b/test/test_ellint_3_nvrtc_float.cpp new file mode 100644 index 000000000..72b2ec71e --- /dev/null +++ b/test/test_ellint_3_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ellint_3_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_3_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_3_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_3_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_3(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From c46b3f23bb48f963d8fefae99a037f2c8acaae1f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Sep 2024 16:18:00 -0400 Subject: [PATCH 23/44] Add CUDA testing of ellint_3 --- test/cuda_jamfile | 3 + test/test_ellint_3_double.cu | 104 +++++++++++++++++++++++++++++++++++ test/test_ellint_3_float.cu | 104 +++++++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 test/test_ellint_3_double.cu create mode 100644 test/test_ellint_3_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 2eb4ab09a..c7841f92d 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +run test_ellint_3_double.cu ; +run test_ellint_3_float.cu ; + # Quad run test_exp_sinh_quad_float.cu ; run test_exp_sinh_quad_double.cu ; diff --git a/test/test_ellint_3_double.cu b/test/test_ellint_3_double.cu new file mode 100644 index 000000000..979e01ff1 --- /dev/null +++ b/test/test_ellint_3_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/ellint_3.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_3(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
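These .cu tests lean on two local helper headers, "cuda_managed_ptr.hpp" and "stopwatch.hpp", which are not shown in this series. For orientation only, a minimal sketch of what cuda_managed_ptr presumably provides; the real helper may differ, and every detail below is an assumption:

    // Hypothetical sketch of the RAII wrapper the tests include; assumes it is
    // built on CUDA unified (managed) memory, which host and device code can
    // both address without explicit cudaMemcpy calls.
    #include <cuda_runtime.h>
    #include <cstddef>

    template <class T>
    class cuda_managed_ptr
    {
        T* data_ = nullptr;
    public:
        explicit cuda_managed_ptr(std::size_t n)
        {
            // One allocation visible to host and device alike
            cudaMallocManaged(&data_, n * sizeof(T));
        }
        ~cuda_managed_ptr() { cudaFree(data_); }
        cuda_managed_ptr(const cuda_managed_ptr&) = delete;
        cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;

        T* get() const { return data_; }   // raw pointer for kernel launches
        T& operator[](std::size_t i) { return data_[i]; }
        const T& operator[](std::size_t i) const { return data_[i]; }
    };

Under that assumption the host loop can fill input_vector1[i] directly and the kernel can read the same memory through .get(), with no explicit host-to-device copies anywhere in the test.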
diff --git a/test/test_ellint_3_float.cu b/test/test_ellint_3_float.cu
new file mode 100644
index 000000000..979e01ff1
--- /dev/null
+++ b/test/test_ellint_3_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/ellint_3.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_3(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
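The verification loops in all of these tests compare the GPU result against a serially computed reference and flag disagreement beyond a fixed number of machine epsilons (10 in the .cu runners, 300 in the NVRTC runners, which also skip non-finite references since the overflow policy is ignore_error). A small helper distilling that check pattern; illustrative only, not part of the tests themselves:

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <cmath>

    // True when `gpu` agrees with a finite serial `reference` to within
    // `eps_tol` machine epsilons; non-finite references are skipped, matching
    // the loops in the tests above.
    template <class T>
    bool agrees_within_eps(T reference, T gpu, T eps_tol)
    {
        if (!std::isfinite(reference))
            return true;
        return boost::math::epsilon_difference(reference, gpu) <= eps_tol;
    }

boost::math::epsilon_difference reports the distance between two values in units of machine epsilon, which makes the tolerance independent of the magnitude of the results being compared.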
From 7ed03791fa7a4f99f9ff058f193b995ef0ff51de Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 09:01:07 -0400
Subject: [PATCH 24/44] Replace use of static const char*

---
 include/boost/math/special_functions/ellint_rf.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/math/special_functions/ellint_rf.hpp b/include/boost/math/special_functions/ellint_rf.hpp
index 39eaca546..eb1c2b6e7 100644
--- a/include/boost/math/special_functions/ellint_rf.hpp
+++ b/include/boost/math/special_functions/ellint_rf.hpp
@@ -37,7 +37,7 @@ namespace boost { namespace math { namespace detail{
    BOOST_MATH_STD_USING
    using namespace boost::math;
 
-   static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
+   constexpr auto function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
 
    if(x < 0 || y < 0 || z < 0)
    {
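Patch 24 applies the same change already made in atanh and ellint_3: a function-local `static const char*` becomes `constexpr auto`. The motivation is device compatibility: CUDA does not permit ordinary function-local static variables in device code, whereas a constexpr constant has no storage to initialize at run time. A sketch of the pattern; the function name and message here are illustrative, not library code:

    #include <boost/math/policies/error_handling.hpp>
    #include <cmath>

    template <class T>
    T domain_checked_sqrt(T x)
    {
        // constexpr: a compile-time constant, valid in host and device code;
        // a function-local `static const char*` would be rejected by nvcc/NVRTC
        // when this function is compiled for the device.
        constexpr auto function = "domain_checked_sqrt<%1%>(%1%)";
        if (x < 0)
            return boost::math::policies::raise_domain_error<T>(
                function, "Argument must be non-negative, but got %1%", x,
                boost::math::policies::policy<>());
        using std::sqrt;
        return sqrt(x);
    }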
From 3304a9c72f2560c0957139f5292f74ed6db274a1 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 09:01:17 -0400
Subject: [PATCH 25/44] Add SYCL testing of ellint_1

---
 test/sycl_jamfile      |  2 ++
 test/test_ellint_1.cpp |  3 +++
 test/test_ellint_1.hpp |  6 ++++++
 3 files changed, 11 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 9c8978a4e..4678d7f33 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -61,6 +61,8 @@ run test_bessel_y.cpp ;
 
 run test_cbrt.cpp ;
 
+run test_ellint_1.cpp ;
+
 run test_sign.cpp ;
 
 run test_round.cpp ;
diff --git a/test/test_ellint_1.cpp b/test/test_ellint_1.cpp
index b5cb2a359..db82132a5 100644
--- a/test/test_ellint_1.cpp
+++ b/test/test_ellint_1.cpp
@@ -6,7 +6,10 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
 #include "test_ellint_1.hpp"
 
 //
diff --git a/test/test_ellint_1.hpp b/test/test_ellint_1.hpp
index 635bcf229..786841302 100644
--- a/test/test_ellint_1.hpp
+++ b/test/test_ellint_1.hpp
@@ -9,11 +9,15 @@
 // Constants are too big for float case, but this doesn't matter for test.
 #endif
 
+#include <boost/math/tools/config.hpp>
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/array.hpp>
+#include <boost/math/special_functions/ellint_1.hpp>
 #include <boost/math/constants/constants.hpp>
 #include "functor.hpp"
 
@@ -139,11 +143,13 @@ void test_spots(T, const char* type_name)
    //
    // Test error handling:
    //
+   #ifndef BOOST_MATH_NO_EXCEPTIONS
    BOOST_CHECK_GE(boost::math::ellint_1(T(1)), boost::math::tools::max_value<T>());
    BOOST_CHECK_GE(boost::math::ellint_1(T(-1)), boost::math::tools::max_value<T>());
    BOOST_CHECK_THROW(boost::math::ellint_1(T(1.0001)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::ellint_1(T(-1.0001)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::ellint_1(T(2.2), T(0.5)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::ellint_1(T(-2.2), T(0.5)), std::domain_error);
+   #endif
 }
 
From a323d5a41faff3ce9aeeaafb8f2a4a9a98eca08b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 09:05:50 -0400
Subject: [PATCH 26/44] Add SYCL testing of ellint 2 with slight tolerance bump

---
 test/sycl_jamfile      | 1 +
 test/test_ellint_2.cpp | 7 +++++++
 test/test_ellint_2.hpp | 9 +++++++++
 3 files changed, 17 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 4678d7f33..ee47dadae 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -62,6 +62,7 @@ run test_bessel_y.cpp ;
 run test_cbrt.cpp ;
 
 run test_ellint_1.cpp ;
+run test_ellint_2.cpp ;
 
 run test_sign.cpp ;
 
diff --git a/test/test_ellint_2.cpp b/test/test_ellint_2.cpp
index ca3e994d4..0da012c13 100644
--- a/test/test_ellint_2.cpp
+++ b/test/test_ellint_2.cpp
@@ -6,7 +6,10 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
 #include "test_ellint_2.hpp"
 
 //
@@ -72,7 +75,11 @@ void expected_results()
       ".*",                          // platform
       largest_type,                  // test type(s)
       ".*",                          // test data group
+      #ifdef SYCL_LANGUAGE_VERSION
+      ".*", 20, 6);                  // test function
+      #else
       ".*", 15, 6);                  // test function
+      #endif
    add_expected_result(
       ".*",                          // compiler
       ".*",                          // stdlib
diff --git a/test/test_ellint_2.hpp b/test/test_ellint_2.hpp
index e38f94d98..29a73c996 100644
--- a/test/test_ellint_2.hpp
+++ b/test/test_ellint_2.hpp
@@ -9,11 +9,18 @@
 // Constants are too big for float case, but this doesn't matter for test.
#endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include +#include #include #include "functor.hpp" @@ -157,10 +164,12 @@ void test_spots(T, const char* type_name) // // Test error handling: // + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_EQUAL(boost::math::ellint_2(T(1)), T(1)); BOOST_CHECK_EQUAL(boost::math::ellint_2(T(-1)), T(1)); BOOST_CHECK_THROW(boost::math::ellint_2(T(1.5)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_2(T(-1.5)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_2(T(1.5), T(1.5)), std::domain_error); + #endif } From bd2318fac69645f6fb601ed42ae4a22fe00e0159 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 09:16:55 -0400 Subject: [PATCH 27/44] Remove recursion from ellint_rj --- .../math/special_functions/ellint_rj.hpp | 93 ++++++++++++------- 1 file changed, 62 insertions(+), 31 deletions(-) diff --git a/include/boost/math/special_functions/ellint_rj.hpp b/include/boost/math/special_functions/ellint_rj.hpp index 66d604b2c..76e1a14eb 100644 --- a/include/boost/math/special_functions/ellint_rj.hpp +++ b/include/boost/math/special_functions/ellint_rj.hpp @@ -72,7 +72,7 @@ BOOST_MATH_GPU_ENABLED T ellint_rc1p_imp(T y, const Policy& pol) } template -BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rj_imp_final(T x, T y, T z, T p, const Policy& pol) { BOOST_MATH_STD_USING @@ -99,36 +99,6 @@ BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) return policies::raise_domain_error(function, "At most one argument can be zero, only possible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); } - // for p < 0, the integral is singular, return Cauchy principal value - if(p < 0) - { - // - // We must ensure that x < y < z. 
- // Since the integral is symmetrical in x, y and z - // we can just permute the values: - // - if(x > y) - BOOST_MATH_GPU_SAFE_SWAP(x, y); - if(y > z) - BOOST_MATH_GPU_SAFE_SWAP(y, z); - if(x > y) - BOOST_MATH_GPU_SAFE_SWAP(x, y); - - BOOST_MATH_ASSERT(x <= y); - BOOST_MATH_ASSERT(y <= z); - - T q = -p; - p = (z * (x + y + q) - x * y) / (z + q); - - BOOST_MATH_ASSERT(p >= 0); - - T value = (p - z) * ellint_rj_imp(x, y, z, p, pol); - value -= 3 * ellint_rf_imp(x, y, z, pol); - value += 3 * sqrt((x * y * z) / (x * y + p * q)) * ellint_rc_imp(T(x * y + p * q), T(p * q), pol); - value /= (z + q); - return value; - } - // // Special cases from http://dlmf.nist.gov/19.20#iii // @@ -261,6 +231,67 @@ BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) return result; } +template +BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) +{ + BOOST_MATH_STD_USING + + constexpr auto function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; + + if(x < 0) + { + return policies::raise_domain_error(function, "Argument x must be non-negative, but got x = %1%", x, pol); + } + if(y < 0) + { + return policies::raise_domain_error(function, "Argument y must be non-negative, but got y = %1%", y, pol); + } + if(z < 0) + { + return policies::raise_domain_error(function, "Argument z must be non-negative, but got z = %1%", z, pol); + } + if(p == 0) + { + return policies::raise_domain_error(function, "Argument p must not be zero, but got p = %1%", p, pol); + } + if(x + y == 0 || y + z == 0 || z + x == 0) + { + return policies::raise_domain_error(function, "At most one argument can be zero, only possible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); + } + + // for p < 0, the integral is singular, return Cauchy principal value + if(p < 0) + { + // + // We must ensure that x < y < z. 
+      // Since the integral is symmetrical
+      // we can just permute the values:
+      //
+      if(x > y)
+         BOOST_MATH_GPU_SAFE_SWAP(x, y);
+      if(y > z)
+         BOOST_MATH_GPU_SAFE_SWAP(y, z);
+      if(x > y)
+         BOOST_MATH_GPU_SAFE_SWAP(x, y);
+
+      BOOST_MATH_ASSERT(x <= y);
+      BOOST_MATH_ASSERT(y <= z);
+
+      T q = -p;
+      p = (z * (x + y + q) - x * y) / (z + q);
+
+      BOOST_MATH_ASSERT(p >= 0);
+
+      T value = (p - z) * ellint_rj_imp_final(x, y, z, p, pol);
+      value -= 3 * ellint_rf_imp(x, y, z, pol);
+      value += 3 * sqrt((x * y * z) / (x * y + p * q)) * ellint_rc_imp(T(x * y + p * q), T(p * q), pol);
+      value /= (z + q);
+      return value;
+   }
+
+   return ellint_rj_imp_final(x, y, z, p, pol);
+}
+
 } // namespace detail
 
 template
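The point of patch 27: the old code handled p < 0 by calling ellint_rj_imp recursively with a transformed, positive p, and recursive device functions are a problem for GPU targets. Splitting the positive-p evaluation into ellint_rj_imp_final turns that recursion into a plain call. Restated for reference, the principal-value transformation that the wrapper above implements (with x <= y <= z after the swaps, and q = -p > 0):

    \[ p' = \frac{z\,(x + y + q) - x y}{z + q} \;\ge\; 0 \]

    \[ (z + q)\, R_J(x, y, z, -q) \;=\; (p' - z)\, R_J(x, y, z, p')
       \;-\; 3\, R_F(x, y, z)
       \;+\; 3\, \sqrt{\frac{x y z}{x y + p' q}}\; R_C\!\left(x y + p' q,\; p' q\right) \]

Because p' is never negative, the inner evaluation can never take the p < 0 branch again, so ellint_rj_imp_final needs no principal-value handling and the call chain is one level deep by construction.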
From 60b750ddd642d769768890910003db21f158868d Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 09:37:43 -0400
Subject: [PATCH 28/44] Add ellint_d CUDA testing

---
 test/cuda_jamfile            |   7 ++-
 test/test_ellint_d_double.cu | 104 +++++++++++++++++++++++++++++++++++
 test/test_ellint_d_float.cu  | 104 +++++++++++++++++++++++++++++++++++
 3 files changed, 212 insertions(+), 3 deletions(-)
 create mode 100644 test/test_ellint_d_double.cu
 create mode 100644 test/test_ellint_d_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index c7841f92d..feb8e0742 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -9,9 +9,6 @@ project : requirements
   [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
   ;
 
-run test_ellint_3_double.cu ;
-run test_ellint_3_float.cu ;
-
 # Quad
 run test_exp_sinh_quad_float.cu ;
 run test_exp_sinh_quad_double.cu ;
@@ -325,6 +322,10 @@ run test_ellint_1_double.cu ;
 run test_ellint_1_float.cu ;
 run test_ellint_2_double.cu ;
 run test_ellint_2_float.cu ;
+run test_ellint_3_double.cu ;
+run test_ellint_3_float.cu ;
+run test_ellint_d_double.cu ;
+run test_ellint_d_float.cu ;
 run test_erf_double.cu ;
 run test_erf_float.cu ;
 
diff --git a/test/test_ellint_d_double.cu b/test/test_ellint_d_double.cu
new file mode 100644
index 000000000..979e01ff1
--- /dev/null
+++ b/test/test_ellint_d_double.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/ellint_d.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_d(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_d(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
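The kernels in these tests assign one thread per element and rely on the launch configuration blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock (ceiling division) covering the whole range, with the excess tail threads falling out of the i < numElements bounds check. A commonly used alternative is a grid-stride loop, which stays correct even when the grid is sized smaller than the data; a sketch against the same test, illustrative only:

    // Grid-stride variant of cuda_test: each thread handles every
    // (gridDim.x * blockDim.x)-th element instead of exactly one.
    __global__ void cuda_test_grid_stride(const float_type *in1, const float_type *in2,
                                          float_type *out, int numElements)
    {
        for (int i = blockDim.x * blockIdx.x + threadIdx.x;
             i < numElements;
             i += blockDim.x * gridDim.x)
        {
            out[i] = boost::math::ellint_d(in1[i], in2[i]);
        }
    }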
diff --git a/test/test_ellint_d_float.cu b/test/test_ellint_d_float.cu
new file mode 100644
index 000000000..50882aa76
--- /dev/null
+++ b/test/test_ellint_d_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/ellint_d.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_d(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_d(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 51a4f4f4d422c2e26e858aba2a4201c33b48bedf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 09:52:10 -0400 Subject: [PATCH 29/44] Add NVRTC testing of ellint_d --- test/nvrtc_jamfile | 2 + test/test_ellint_d_nvrtc_double.cpp | 190 ++++++++++++++++++++++++++++ test/test_ellint_d_nvrtc_float.cpp | 190 ++++++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_ellint_d_nvrtc_double.cpp create mode 100644 test/test_ellint_d_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index fb945f1e3..1285387fb 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -328,6 +328,8 @@ run test_ellint_2_nvrtc_double.cpp ; run test_ellint_2_nvrtc_float.cpp ; run test_ellint_3_nvrtc_double.cpp ; run test_ellint_3_nvrtc_float.cpp ; +run test_ellint_d_nvrtc_double.cpp ; +run test_ellint_d_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_ellint_d_nvrtc_double.cpp b/test/test_ellint_d_nvrtc_double.cpp new file mode 100644 index 000000000..cb65a2e73 --- /dev/null +++ b/test/test_ellint_d_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_d_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_d(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_d_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_d_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_d_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_d(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_d_nvrtc_float.cpp b/test/test_ellint_d_nvrtc_float.cpp new file mode 100644 index 000000000..727d9dcd1 --- /dev/null +++ b/test/test_ellint_d_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ellint_d_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_d(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_d_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_d_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + 
checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_d_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_d(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From afbb3cb4e79b7d88d03446184e6d4ef14ce809c8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 10:23:18 -0400 Subject: [PATCH 30/44] Add SYCL testing of ellint_d --- test/sycl_jamfile | 1 + test/test_ellint_d.cpp | 3 +++ test/test_ellint_d.hpp | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index ee47dadae..d475efafa 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -63,6 +63,7 @@ run test_cbrt.cpp ; run test_ellint_1.cpp ; run test_ellint_2.cpp ; +run test_ellint_d.cpp ; run test_sign.cpp ; diff --git a/test/test_ellint_d.cpp b/test/test_ellint_d.cpp index 5e76a49fb..420bc0c02 100644 --- a/test/test_ellint_d.cpp +++ b/test/test_ellint_d.cpp @@ -4,7 +4,10 @@ // Boost Software License, Version 1.0. 
(See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
 #include "test_ellint_d.hpp"
 
 //
diff --git a/test/test_ellint_d.hpp b/test/test_ellint_d.hpp
index de53936f1..c33a4d942 100644
--- a/test/test_ellint_d.hpp
+++ b/test/test_ellint_d.hpp
@@ -8,11 +8,17 @@
 // Constants are too big for float case, but this doesn't matter for test.
 #endif
 
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/array.hpp>
+#include <boost/math/special_functions/ellint_d.hpp>
 #include "functor.hpp"
 
@@ -117,6 +123,7 @@ void test_spots(T, const char* type_name)
 
     do_test_ellint_d1<T>(ellint_d_data, type_name, "Elliptic Integral D: Random Data");
 
+    #ifndef BOOST_MATH_NO_EXCEPTIONS
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1)), std::domain_error);
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(-1)), std::domain_error);
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1.5)), std::domain_error);
@@ -126,5 +133,6 @@ void test_spots(T, const char* type_name)
        BOOST_CHECK_EQUAL(boost::math::ellint_d(T(0.5), std::numeric_limits<T>::infinity()), std::numeric_limits<T>::infinity());
     }
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1.5), T(1.0)), std::domain_error);
+    #endif
 }
 
From 147a0c30d4b3d4b6fce0a071e94f519c2004e3e2 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 10:47:48 -0400
Subject: [PATCH 31/44] Remove SYCL ellint_3 support

---
 .../boost/math/special_functions/ellint_3.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/boost/math/special_functions/ellint_3.hpp b/include/boost/math/special_functions/ellint_3.hpp
index 5ca11c193..b8df7e264 100644
--- a/include/boost/math/special_functions/ellint_3.hpp
+++ b/include/boost/math/special_functions/ellint_3.hpp
@@ -40,11 +40,11 @@ namespace boost { namespace math { namespace detail{
 
 template <typename T, typename Policy>
-BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol);
+BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol);
 
 // Elliptic integral (Legendre form) of the third kind
 template <typename T, typename Policy>
-BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol)
+BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol)
 {
    // Note vc = 1-v presumably without cancellation error.
BOOST_MATH_STD_USING @@ -272,7 +272,7 @@ BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) // Complete elliptic integral (Legendre form) of the third kind template -BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) +BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) { // Note arg vc = 1-v, possibly without cancellation errors BOOST_MATH_STD_USING @@ -320,13 +320,13 @@ BOOST_MATH_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) } template -BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const boost::math::false_type&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const boost::math::false_type&) { return boost::math::ellint_3(k, v, phi, policies::policy<>()); } template -BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const boost::math::true_type&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -341,7 +341,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_ } // namespace detail template -BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -356,14 +356,14 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ell } template -BOOST_MATH_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) +BOOST_MATH_CUDA_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_3(k, v, phi, tag_type()); } template -BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v) { return ellint_3(k, v, policies::policy<>()); } From 147a0c30d4b3d4b6fce0a071e94f519c2004e3e2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 10:47:53 -0400 Subject: [PATCH 32/44] Update docs --- doc/sf/ellint_carlson.qbk | 40 ++++++++++++------------ doc/sf/ellint_legendre.qbk | 64 +++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/doc/sf/ellint_carlson.qbk b/doc/sf/ellint_carlson.qbk index ca39cd6be..db4569746 100644 --- a/doc/sf/ellint_carlson.qbk +++ b/doc/sf/ellint_carlson.qbk @@ -17,10 +17,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) }} // namespaces @@ -32,10 +32,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) template - ``__sf_result`` 
ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) }} // namespaces @@ -47,10 +47,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) }} // namespaces @@ -62,10 +62,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_rc(T1 x, T2 y) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y) template - ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) }} // namespaces @@ -76,10 +76,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) }} // namespaces @@ -98,10 +98,10 @@ when the arguments are of different types: otherwise the return is the same type as the arguments. template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's Elliptic Integral ['R[sub F]]: @@ -113,10 +113,10 @@ one may be zero. Otherwise returns the result of __domain_error. [optional_policy] template - ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's elliptic integral R[sub D]: @@ -128,10 +128,10 @@ zero, and that z >= 0. Otherwise returns the result of __domain_error. 
[optional_policy] template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) Returns Carlson's elliptic integral R[sub J]: @@ -149,10 +149,10 @@ using the relation: [equation ellint17] template - ``__sf_result`` ellint_rc(T1 x, T2 y) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y) template - ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) Returns Carlson's elliptic integral R[sub C]: @@ -170,10 +170,10 @@ using the relation: [equation ellint18] template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's elliptic integral ['R[sub G]:] diff --git a/doc/sf/ellint_legendre.qbk b/doc/sf/ellint_legendre.qbk index c780a9b01..dba7af60c 100644 --- a/doc/sf/ellint_legendre.qbk +++ b/doc/sf/ellint_legendre.qbk @@ -17,16 +17,16 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi); template - ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); template - ``__sf_result`` ellint_1(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k); template - ``__sf_result`` ellint_1(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k, const ``__Policy``&); }} // namespaces @@ -42,10 +42,10 @@ when T1 and T2 are different types: when they are the same type then the result is the same type as the arguments. 
template - ``__sf_result`` ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi); template - ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the first kind ['F([phi], k)]: @@ -56,10 +56,10 @@ Requires k[super 2]sin[super 2](phi) < 1, otherwise returns the result of __doma [optional_policy] template - ``__sf_result`` ellint_1(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k); template - ``__sf_result`` ellint_1(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k, const ``__Policy``&); Returns the complete elliptic integral of the first kind ['K(k)]: @@ -123,16 +123,16 @@ and namespace boost { namespace math { template - ``__sf_result`` ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi); template - ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); template - ``__sf_result`` ellint_2(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k); template - ``__sf_result`` ellint_2(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k, const ``__Policy``&); }} // namespaces @@ -148,10 +148,10 @@ when T1 and T2 are different types: when they are the same type then the result is the same type as the arguments. template - ``__sf_result`` ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi); template - ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the second kind ['E([phi], k)]: @@ -162,10 +162,10 @@ Requires k[super 2]sin[super 2](phi) < 1, otherwise returns the result of __doma [optional_policy] template - ``__sf_result`` ellint_2(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k); template - ``__sf_result`` ellint_2(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k, const ``__Policy``&); Returns the complete elliptic integral of the second kind ['E(k)]: @@ -230,16 +230,16 @@ and namespace boost { namespace math { template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); template - ``__sf_result`` ellint_3(T1 k, T2 n); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n); template - ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); }} // namespaces @@ -255,10 +255,10 @@ when the arguments are of different types: when they are the same type then the is the same type as the arguments. template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the third kind ['[Pi](n, [phi], k)]: @@ -271,10 +271,10 @@ would be complex). 
[optional_policy] template - ``__sf_result`` ellint_3(T1 k, T2 n); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n); template - ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); Returns the complete elliptic integral of the first kind ['[Pi](n, k)]: @@ -355,16 +355,16 @@ and namespace boost { namespace math { template - ``__sf_result`` ellint_d(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi); template - ``__sf_result`` ellint_d(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi, const ``__Policy``&); template - ``__sf_result`` ellint_d(T1 k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k); template - ``__sf_result`` ellint_d(T1 k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, const ``__Policy``&); }} // namespaces @@ -378,10 +378,10 @@ when the arguments are of different types: when they are the same type then the is the same type as the arguments. template - ``__sf_result`` ellint_d(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi); template - ``__sf_result`` ellint_3(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_3(T1 k, T2 phi, const ``__Policy``&); Returns the incomplete elliptic integral: @@ -394,10 +394,10 @@ would be complex). [optional_policy] template - ``__sf_result`` ellint_d(T1 k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k); template - ``__sf_result`` ellint_d(T1 k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, const ``__Policy``&); Returns the complete elliptic integral ['D(k) = D([pi]/2, k)] From 155e18642d7ffbf5ea677c2fa51ca5bb5afb92f7 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 11:24:35 -0400 Subject: [PATCH 33/44] Add GPU support to jacobi zeta --- include/boost/math/special_functions/jacobi_zeta.hpp | 10 ++++++---- include/boost/math/special_functions/math_fwd.hpp | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/boost/math/special_functions/jacobi_zeta.hpp b/include/boost/math/special_functions/jacobi_zeta.hpp index c4ba7d23d..8b6f80912 100644 --- a/include/boost/math/special_functions/jacobi_zeta.hpp +++ b/include/boost/math/special_functions/jacobi_zeta.hpp @@ -11,6 +11,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -27,7 +29,7 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) +BOOST_MATH_GPU_ENABLED T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -55,14 +57,14 @@ T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) return invert ? 
T(-result) : result; } template -inline T jacobi_zeta_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T jacobi_zeta_imp(T phi, T k, const Policy& pol) { return jacobi_zeta_imp(phi, k, pol, T(1 - k * k)); } } // detail template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -70,7 +72,7 @@ inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, cons } template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) { return boost::math::jacobi_zeta(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 44600422d..2da4a5ed6 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -412,10 +412,10 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t jacobi_zeta(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t jacobi_zeta(T1 k, T2 phi); template - tools::promote_args_t jacobi_zeta(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t jacobi_zeta(T1 k, T2 phi, const Policy& pol); template tools::promote_args_t heuman_lambda(T1 k, T2 phi); @@ -1413,7 +1413,7 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_d(T1 k, T2 phi){ return boost::math::ellint_d(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ From 58a79f5d70a496c7c1d78b614e5a0401e5b92128 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 11:24:50 -0400 Subject: [PATCH 34/44] Add CUDA testing of jacobi zeta --- test/cuda_jamfile | 2 + test/test_jacobi_zeta_double.cu | 120 ++++++++++++++++++++++++++++++++ test/test_jacobi_zeta_float.cu | 120 ++++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 test/test_jacobi_zeta_double.cu create mode 100644 test/test_jacobi_zeta_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index feb8e0742..e84fcaedc 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -326,6 +326,8 @@ run test_ellint_3_double.cu ; run test_ellint_3_float.cu ; run test_ellint_d_double.cu ; run test_ellint_d_float.cu ; +run test_jacobi_zeta_double.cu ; +run test_jacobi_zeta_float.cu ; run test_erf_double.cu ; run test_erf_float.cu ; diff --git a/test/test_jacobi_zeta_double.cu b/test/test_jacobi_zeta_double.cu new file mode 100644 index 000000000..8594da140 --- /dev/null +++ b/test/test_jacobi_zeta_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::jacobi_zeta(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the jacobi_zeta CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::jacobi_zeta(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_jacobi_zeta_float.cu b/test/test_jacobi_zeta_float.cu new file mode 100644 index 000000000..7b473455a --- /dev/null +++ b/test/test_jacobi_zeta_float.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::jacobi_zeta(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the jacobi_zeta CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!"
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::jacobi_zeta(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 6400e4490eb5fba193fd8ac3787f7bc18788248f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 11:29:48 -0400 Subject: [PATCH 35/44] Add NVRTC testing of jacobi zeta --- test/nvrtc_jamfile | 2 + test/test_jacobi_zeta_nvrtc_double.cpp | 190 +++++++++++++++++++++++++ test/test_jacobi_zeta_nvrtc_float.cpp | 190 +++++++++++++++++++++++++ 3 files changed, 382 insertions(+) create mode 100644 test/test_jacobi_zeta_nvrtc_double.cpp create mode 100644 test/test_jacobi_zeta_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1285387fb..0f4bc77d8 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -330,6 +330,8 @@ run test_ellint_3_nvrtc_double.cpp ; run test_ellint_3_nvrtc_float.cpp ; run test_ellint_d_nvrtc_double.cpp ; run test_ellint_d_nvrtc_float.cpp ; +run test_jacobi_zeta_nvrtc_double.cpp ; +run test_jacobi_zeta_nvrtc_float.cpp ; run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/test_jacobi_zeta_nvrtc_double.cpp b/test/test_jacobi_zeta_nvrtc_double.cpp new file mode 100644 index 000000000..ded2e6657 --- /dev/null +++ b/test/test_jacobi_zeta_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_jacobi_zeta_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::jacobi_zeta(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_jacobi_zeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_jacobi_zeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_jacobi_zeta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::jacobi_zeta(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_jacobi_zeta_nvrtc_float.cpp b/test/test_jacobi_zeta_nvrtc_float.cpp new file mode 100644 index 000000000..de52da118 --- /dev/null +++ b/test/test_jacobi_zeta_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_jacobi_zeta_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::jacobi_zeta(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_jacobi_zeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_jacobi_zeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_jacobi_zeta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::jacobi_zeta(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 3e96d7e5a346b4fc37584e7cf81d61c44d64d829 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 11:39:34 -0400 Subject: [PATCH 36/44] Add SYCL testing of jacobi zeta --- test/sycl_jamfile | 1 + test/test_jacobi_zeta.cpp | 3 +++ test/test_jacobi_zeta.hpp | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index d475efafa..d59e486db 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -64,6 +64,7 @@ run test_cbrt.cpp ; run test_ellint_1.cpp ; run test_ellint_2.cpp ; run test_ellint_d.cpp ; +run test_jacobi_zeta.cpp ; run test_sign.cpp ; diff --git a/test/test_jacobi_zeta.cpp b/test/test_jacobi_zeta.cpp index 77f33efb1..c64f99580 100644 --- a/test/test_jacobi_zeta.cpp +++ b/test/test_jacobi_zeta.cpp @@ -4,7 +4,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_jacobi_zeta.hpp" // diff --git a/test/test_jacobi_zeta.hpp b/test/test_jacobi_zeta.hpp index 1aa72feb0..a39d3ba70 100644 --- a/test/test_jacobi_zeta.hpp +++ b/test/test_jacobi_zeta.hpp @@ -8,11 +8,17 @@ // Constants are too big for float case, but this doesn't matter for test. 
#endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include #include //#include #include From d57d65e57150c46047eec4d6045c85011f59a4cb Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 12:27:39 -0400 Subject: [PATCH 37/44] Add GPU support to heuman_lambda --- .../math/special_functions/heuman_lambda.hpp | 22 ++++++++++++------- .../boost/math/special_functions/math_fwd.hpp | 6 ++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/boost/math/special_functions/heuman_lambda.hpp b/include/boost/math/special_functions/heuman_lambda.hpp index 0fbf4a980..05002725f 100644 --- a/include/boost/math/special_functions/heuman_lambda.hpp +++ b/include/boost/math/special_functions/heuman_lambda.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2015 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +11,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -26,13 +30,13 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T heuman_lambda_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T heuman_lambda_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - const char* function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; if(fabs(k) > 1) return policies::raise_domain_error(function, "We require |k| <= 1 but got k = %1%", k, pol); @@ -51,10 +55,10 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) } else { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 - > precision_tag_type; + typedef boost::math::integral_constant::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 
1 : 2 + > precision_tag_type; T rkp = sqrt(kp); T ratio; @@ -63,7 +67,9 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) return policies::raise_domain_error(function, "When 1-k^2 == 1 then phi must be < Pi/2, but got phi = %1%", phi, pol); } else + { ratio = ellint_f_imp(phi, rkp, pol, k2) / ellint_k_imp(rkp, pol, k2); + } result = ratio + ellint_k_imp(k, pol, precision_tag_type()) * jacobi_zeta_imp(phi, rkp, pol, k2) / constants::half_pi(); } return result; @@ -72,7 +78,7 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) } // detail template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -80,7 +86,7 @@ inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, co } template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) { return boost::math::heuman_lambda(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 2da4a5ed6..35f8dd1bb 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -418,10 +418,10 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t jacobi_zeta(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t heuman_lambda(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t heuman_lambda(T1 k, T2 phi); template - tools::promote_args_t heuman_lambda(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t heuman_lambda(T1 k, T2 phi, const Policy& pol); namespace detail{ @@ -1416,7 +1416,7 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_1(T k){ return boost::math::ellint_1(k, Policy()); }\ From 8e5f338d975881e3f52de1afd7acc4fb27f5dcbe Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 12:27:50 -0400 Subject: [PATCH 38/44] Add NVRTC testing of heuman lambda --- test/nvrtc_jamfile | 3 + test/test_heumann_lambda_nvrtc_double.cpp | 190 ++++++++++++++++++++++ test/test_heumann_lambda_nvrtc_float.cpp | 190 ++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 test/test_heumann_lambda_nvrtc_double.cpp create mode 100644 test/test_heumann_lambda_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 0f4bc77d8..eb5da37c8 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +run test_heumann_lambda_nvrtc_double.cpp ; +run test_heumann_lambda_nvrtc_float.cpp ; + # Quad run test_exp_sinh_quad_nvrtc_float.cpp ; run test_exp_sinh_quad_nvrtc_double.cpp ; diff --git 
a/test/test_heumann_lambda_nvrtc_double.cpp b/test/test_heumann_lambda_nvrtc_double.cpp new file mode 100644 index 000000000..38c762fd5 --- /dev/null +++ b/test/test_heumann_lambda_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_heuman_lambda_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::heuman_lambda(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_heuman_lambda_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_heuman_lambda_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_heuman_lambda_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, 
*h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::heuman_lambda(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_heumann_lambda_nvrtc_float.cpp b/test/test_heumann_lambda_nvrtc_float.cpp new file mode 100644 index 000000000..5139b9d6f --- /dev/null +++ b/test/test_heumann_lambda_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_heuman_lambda_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::heuman_lambda(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_heuman_lambda_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_heuman_lambda_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_heuman_lambda_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::heuman_lambda(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4a449e39f1a4b4233c50bd75194803a99d10f14a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 16 Sep 2024 12:55:28 -0400 Subject: [PATCH 39/44] Add CUDA testing of heuman_lambda --- test/cuda_jamfile | 2 + test/test_heuman_lambda_double.cu | 120 ++++++++++++++++++++++++++++++ test/test_heuman_lambda_float.cu | 120 ++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 test/test_heuman_lambda_double.cu create mode 100644 test/test_heuman_lambda_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index e84fcaedc..34ac3b965 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -328,6 +328,8 @@ run test_ellint_d_double.cu ; run test_ellint_d_float.cu ; run test_jacobi_zeta_double.cu ; run test_jacobi_zeta_float.cu ; +run test_heuman_lambda_double.cu ; +run test_heuman_lambda_float.cu ; run test_erf_double.cu ; run test_erf_float.cu ; diff --git a/test/test_heuman_lambda_double.cu b/test/test_heuman_lambda_double.cu new file mode 100644 index 000000000..361dbe805 --- /dev/null +++ b/test/test_heuman_lambda_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::heuman_lambda(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the heuman_lambda CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + int fail_counter = 0; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200) + { + std::cerr << "Result verification failed at element " << i << "!\n" + << "Device: " << output_vector[i] << '\n' + << " Host: " << results[i] << '\n' + << " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + fail_counter++; + if (fail_counter > 100) + { + break; + } + } + } + } + + if (fail_counter > 0) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_heuman_lambda_float.cu b/test/test_heuman_lambda_float.cu new file mode 100644 index 000000000..361dbe805 --- /dev/null +++ b/test/test_heuman_lambda_float.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/heuman_lambda.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the heuman_lambda CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!"
<< std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From 6f7ccf73d6ae357a26ab3d893923326fbd6ba595 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 13:31:33 -0400
Subject: [PATCH 40/44] Add SYCL testing of heuman lambda

---
 test/sycl_jamfile           | 1 +
 test/test_heuman_lambda.cpp | 3 +++
 test/test_heuman_lambda.hpp | 6 ++++++
 3 files changed, 10 insertions(+)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index d59e486db..ff6d84fde 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -65,6 +65,7 @@ run test_ellint_1.cpp ;
 run test_ellint_2.cpp ;
 run test_ellint_d.cpp ;
 run test_jacobi_zeta.cpp ;
+run test_heuman_lambda.cpp ;

 run test_sign.cpp ;

diff --git a/test/test_heuman_lambda.cpp b/test/test_heuman_lambda.cpp
index 83709c635..cdcf39aa6 100644
--- a/test/test_heuman_lambda.cpp
+++ b/test/test_heuman_lambda.cpp
@@ -4,7 +4,10 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
 #include "test_heuman_lambda.hpp"

 //

diff --git a/test/test_heuman_lambda.hpp b/test/test_heuman_lambda.hpp
index 23720b2d0..6081dac48 100644
--- a/test/test_heuman_lambda.hpp
+++ b/test/test_heuman_lambda.hpp
@@ -8,11 +8,17 @@
 // Constants are too big for float case, but this doesn't matter for test.
 #endif

+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
+#include <boost/math/special_functions/heuman_lambda.hpp>
 #include <boost/math/constants/constants.hpp>
 #include <boost/array.hpp>
 #include "functor.hpp"

From 03f6f82175d2a2213099c02f177dd9dce76d0262 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 13:32:11 -0400
Subject: [PATCH 41/44] Add markers to docs

---
 doc/sf/ellint_legendre.qbk | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/sf/ellint_legendre.qbk b/doc/sf/ellint_legendre.qbk
index dba7af60c..50b633af9 100644
--- a/doc/sf/ellint_legendre.qbk
+++ b/doc/sf/ellint_legendre.qbk
@@ -463,10 +463,10 @@ using the relation:
 namespace boost { namespace math {

   template <class T1, class T2>
-  ``__sf_result`` jacobi_zeta(T1 k, T2 phi);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` jacobi_zeta(T1 k, T2 phi);

   template <class T1, class T2, class Policy>
-  ``__sf_result`` jacobi_zeta(T1 k, T2 phi, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` jacobi_zeta(T1 k, T2 phi, const ``__Policy``&);

 }} // namespaces

@@ -543,10 +543,10 @@ is [@../../example/jacobi_zeta_example.cpp jacobi_zeta_example.cpp].
 namespace boost { namespace math {

   template <class T1, class T2>
-  ``__sf_result`` heuman_lambda(T1 k, T2 phi);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` heuman_lambda(T1 k, T2 phi);

   template <class T1, class T2, class Policy>
-  ``__sf_result`` heuman_lambda(T1 k, T2 phi, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` heuman_lambda(T1 k, T2 phi, const ``__Policy``&);

 }} // namespaces

From b7baa62f827a2cb2dd2135b9c76a6418b28e497b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 13:44:07 -0400
Subject: [PATCH 42/44] Add marker for CUDA only functions in the docs

---
 doc/overview/gpu.qbk | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index b97b059a6..7fb27e645 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -3,8 +3,9 @@
 [h4 GPU Support]

 Selected functions, distributions, tools, etc. support running on both host and devices.
-These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
-We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+These functions will have the annotation `BOOST_MATH_GPU_ENABLED` or `BOOST_MATH_CUDA_ENABLED` next to their individual documentation.
+Functions marked with `BOOST_MATH_GPU_ENABLED` are tested using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+Functions marked with `BOOST_MATH_CUDA_ENABLED` are fewer in number: they rely on CUDA-specific functionality, so SYCL is unsupported for them.

 [h4 Policies]

From 26a404b4d38235efe022844d7a6bfc2d2e00f421 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 15:12:40 -0400
Subject: [PATCH 43/44] Fix windows cases of overflow error

---
 test/test_ellint_1.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_ellint_1.cpp b/test/test_ellint_1.cpp
index db82132a5..9366e4545 100644
--- a/test/test_ellint_1.cpp
+++ b/test/test_ellint_1.cpp
@@ -10,6 +10,10 @@
 #include <pch_light.hpp>
 #endif

+#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#endif
+
 #include "test_ellint_1.hpp"

 //

From 20bd10dbd6d6d411441e1843a80e978d88eded60 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 16:33:48 -0400
Subject: [PATCH 44/44] Fix CUDA only macro for NVRTC

---
 include/boost/math/tools/config.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp
index 6d7af4d2b..99bc4ee27 100644
--- a/include/boost/math/tools/config.hpp
+++ b/include/boost/math/tools/config.hpp
@@ -789,6 +789,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return

 #define BOOST_MATH_HAS_GPU_SUPPORT

 #define BOOST_MATH_GPU_ENABLED __host__ __device__
+#define BOOST_MATH_CUDA_ENABLED __host__ __device__

 #define BOOST_MATH_STATIC static
 #define BOOST_MATH_STATIC_LOCAL_VARIABLE
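
The end-to-end usage pattern this series enables looks like the following minimal sketch. It is illustrative only and not part of any patch: it assumes a CUDA toolchain with the Boost.Math headers on the include path, the kernel and variable names (heuman_lambda_kernel, n, block) are invented for the example, and the crude absolute tolerance stands in for the epsilon-based comparison the real tests above use.

// Minimal sketch: calling a BOOST_MATH_GPU_ENABLED special function from a
// CUDA kernel and spot-checking it against the host implementation.
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
#include <boost/math/special_functions/heuman_lambda.hpp>

__global__ void heuman_lambda_kernel(const double* k, const double* phi, double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        // Same call on device as on host, thanks to BOOST_MATH_GPU_ENABLED.
        out[i] = boost::math::heuman_lambda(k[i], phi[i]);
    }
}

int main()
{
    const int n = 1024;
    double *k = nullptr, *phi = nullptr, *out = nullptr;

    // Unified (managed) memory keeps the sketch short; the tests above wrap
    // the same allocation in an RAII helper (cuda_managed_ptr).
    cudaMallocManaged(&k, n * sizeof(double));
    cudaMallocManaged(&phi, n * sizeof(double));
    cudaMallocManaged(&out, n * sizeof(double));

    for (int i = 0; i < n; ++i)
    {
        k[i] = static_cast<double>(std::rand()) / RAND_MAX;
        phi[i] = static_cast<double>(std::rand()) / RAND_MAX;
    }

    const int block = 256;
    heuman_lambda_kernel<<<(n + block - 1) / block, block>>>(k, phi, out, n);
    cudaDeviceSynchronize();

    // Compare device results with the host implementation; a crude absolute
    // tolerance is used here, where the tests use an epsilon-based distance.
    int failures = 0;
    for (int i = 0; i < n; ++i)
    {
        const double host = boost::math::heuman_lambda(k[i], phi[i]);
        if (std::isfinite(host) && std::abs(host - out[i]) > 1e-12)
        {
            ++failures;
        }
    }

    cudaFree(k);
    cudaFree(phi);
    cudaFree(out);

    std::cout << (failures == 0 ? "PASSED" : "FAILED") << std::endl;
    return failures == 0 ? 0 : 1;
}

Building it with nvcc and the Boost headers on the include path should suffice; under NVRTC the kernel body is instead compiled at run time from a source string, as in the NVRTC test earlier in the series.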