From 1324401c92f793fcd23b6a4a49f81ad49326e0db Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Mon, 16 Sep 2024 14:04:07 -0400
Subject: [PATCH 1/3] Add GPU support to airy functions

Add SYCL testing of airy functions
Add CUDA testing of airy functions
Add NVRTC testing of airy functions
Add GPU support to ellint rc
Add GPU support to ellint rd
Add GPU support to ellint rf
Add GPU support to ellint rg
Add GPU support to ellint rj
Add GPU support to ellint d
Add GPU support to ellint_1
Markup forward and add ellint_3 return type def for NVRTC platform
Add CUDA testing of ellint 1
NVRTC fixes
Add NVRTC testing of ellint_1
Add GPU support to ellint_2
Add CUDA testing of ellint_2
Fix NVRTC errors
Add NVRTC testing of ellint_2
Add GPU support to atanh
Add GPU support to ellint_3
Add NVRTC testing of ellint_3
Add CUDA testing of ellint_3
Replace use of static const char*
Add SYCL testing of ellint_1
Add SYCL testing of ellint 2 with slight tolerance bump
Remove recursion from ellint_rj
Add ellint_d CUDA testing
Add NVRTC testing of ellint_d
Add SYCL testing of ellint_d
Remove SYCL ellint_3 support
Update docs
Add GPU support to jacobi zeta
Add CUDA testing of jacobi zeta
Add NVRTC testing of jacobi zeta
Add SYCL testing of jacobi zeta
Add GPU support to heuman_lambda
Add NVRTC testing of heuman lambda
Add CUDA testing of heuman_lambda
Add SYCL testing of heuman lambda
Add markers to docs
Add marker for CUDA only functions in the docs
---
 doc/overview/gpu.qbk | 5 +-
 doc/sf/ellint_carlson.qbk | 40 ++--
 doc/sf/ellint_legendre.qbk | 72 +++----
 include/boost/math/special_functions/airy.hpp | 52 ++---
 .../boost/math/special_functions/atanh.hpp | 10 +-
 .../boost/math/special_functions/ellint_1.hpp | 65 +++---
 .../boost/math/special_functions/ellint_2.hpp | 69 ++++---
 .../boost/math/special_functions/ellint_3.hpp | 22 +-
 .../boost/math/special_functions/ellint_d.hpp | 21 +-
 .../math/special_functions/ellint_rc.hpp | 12 +-
 .../math/special_functions/ellint_rd.hpp | 33 +--
 .../math/special_functions/ellint_rf.hpp | 23 ++-
 .../math/special_functions/ellint_rg.hpp | 23 +--
 .../math/special_functions/ellint_rj.hpp | 116 +++++++----
 .../math/special_functions/heuman_lambda.hpp | 22 +-
 .../math/special_functions/jacobi_zeta.hpp | 10 +-
 .../boost/math/special_functions/math_fwd.hpp | 154 +++++++-------
 include/boost/math/tools/workaround.hpp | 2 +-
 test/cuda_jamfile | 22 ++
 test/nvrtc_jamfile | 23 +++
 test/sycl_jamfile | 8 +
 test/test_airy.cpp | 12 ++
 test/test_airy_ai_double.cu | 100 +++++++++
 test/test_airy_ai_float.cu | 100 +++++++++
 test/test_airy_ai_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_airy_ai_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_airy_ai_prime_double.cu | 100 +++++++++
 test/test_airy_ai_prime_float.cu | 100 +++++++++
 test/test_airy_ai_prime_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_airy_ai_prime_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_airy_bi_double.cu | 100 +++++++++
 test/test_airy_bi_float.cu | 100 +++++++++
 test/test_airy_bi_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_airy_bi_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_airy_bi_prime_double.cu | 100 +++++++++
 test/test_airy_bi_prime_float.cu | 100 +++++++++
 test/test_airy_bi_prime_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_airy_bi_prime_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_ellint_1.cpp | 3 +
 test/test_ellint_1.hpp | 6 +
 test/test_ellint_1_double.cu | 106 ++++++++++
 test/test_ellint_1_float.cu | 100 +++++++++
 test/test_ellint_1_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_ellint_1_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_ellint_2.cpp | 7 +
 test/test_ellint_2.hpp | 9 +
 test/test_ellint_2_double.cu | 100 +++++++++
 test/test_ellint_2_float.cu | 100 +++++++++
 test/test_ellint_2_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_ellint_2_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_ellint_3_double.cu | 104 ++++++++++
 test/test_ellint_3_float.cu | 104 ++++++++++
 test/test_ellint_3_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_ellint_3_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_ellint_d.cpp | 3 +
 test/test_ellint_d.hpp | 8 +
 test/test_ellint_d_double.cu | 104 ++++++++++
 test/test_ellint_d_float.cu | 104 ++++++++++
 test/test_ellint_d_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_ellint_d_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_heuman_lambda.cpp | 3 +
 test/test_heuman_lambda.hpp | 6 +
 test/test_heuman_lambda_double.cu | 120 +++++++++++
 test/test_heuman_lambda_float.cu | 120 +++++++++++
 test/test_heumann_lambda_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_heumann_lambda_nvrtc_float.cpp | 190 ++++++++++++++++++
 test/test_jacobi_zeta.cpp | 3 +
 test/test_jacobi_zeta.hpp | 6 +
 test/test_jacobi_zeta_double.cu | 120 +++++++++++
 test/test_jacobi_zeta_float.cu | 120 +++++++++++
 test/test_jacobi_zeta_nvrtc_double.cpp | 190 ++++++++++++++++++
 test/test_jacobi_zeta_nvrtc_float.cpp | 190 ++++++++++++++++++
 72 files changed, 6436 insertions(+), 336 deletions(-)
 create mode 100644 test/test_airy_ai_double.cu
 create mode 100644 test/test_airy_ai_float.cu
 create mode 100644 test/test_airy_ai_nvrtc_double.cpp
 create mode 100644 test/test_airy_ai_nvrtc_float.cpp
 create mode 100644 test/test_airy_ai_prime_double.cu
 create mode 100644 test/test_airy_ai_prime_float.cu
 create mode 100644 test/test_airy_ai_prime_nvrtc_double.cpp
 create mode 100644 test/test_airy_ai_prime_nvrtc_float.cpp
 create mode 100644 test/test_airy_bi_double.cu
 create mode 100644 test/test_airy_bi_float.cu
 create mode 100644 test/test_airy_bi_nvrtc_double.cpp
 create mode 100644 test/test_airy_bi_nvrtc_float.cpp
 create mode 100644 test/test_airy_bi_prime_double.cu
 create mode 100644 test/test_airy_bi_prime_float.cu
 create mode 100644 test/test_airy_bi_prime_nvrtc_double.cpp
 create mode 100644 test/test_airy_bi_prime_nvrtc_float.cpp
 create mode 100644 test/test_ellint_1_double.cu
 create mode 100644 test/test_ellint_1_float.cu
 create mode 100644 test/test_ellint_1_nvrtc_double.cpp
 create mode 100644 test/test_ellint_1_nvrtc_float.cpp
 create mode 100644 test/test_ellint_2_double.cu
 create mode 100644 test/test_ellint_2_float.cu
 create mode 100644 test/test_ellint_2_nvrtc_double.cpp
 create mode 100644 test/test_ellint_2_nvrtc_float.cpp
 create mode 100644 test/test_ellint_3_double.cu
 create mode 100644 test/test_ellint_3_float.cu
 create mode 100644 test/test_ellint_3_nvrtc_double.cpp
 create mode 100644 test/test_ellint_3_nvrtc_float.cpp
 create mode 100644 test/test_ellint_d_double.cu
 create mode 100644 test/test_ellint_d_float.cu
 create mode 100644 test/test_ellint_d_nvrtc_double.cpp
 create mode 100644 test/test_ellint_d_nvrtc_float.cpp
 create mode 100644 test/test_heuman_lambda_double.cu
 create mode 100644 test/test_heuman_lambda_float.cu
 create mode 100644 test/test_heumann_lambda_nvrtc_double.cpp
 create mode 100644 test/test_heumann_lambda_nvrtc_float.cpp
 create mode 100644 test/test_jacobi_zeta_double.cu
 create mode 100644 test/test_jacobi_zeta_float.cu
 create mode 100644 test/test_jacobi_zeta_nvrtc_double.cpp
 create mode 100644 test/test_jacobi_zeta_nvrtc_float.cpp

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index b97b059a60..7fb27e645e 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -3,8 +3,9 @@
 [h4 GPU Support]
 
 Selected functions, distributions, tools, etc. support running on both host and devices.
-These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
-We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+These functions will have the annotation `BOOST_MATH_GPU_ENABLED` or `BOOST_MATH_CUDA_ENABLED` next to their individual documentation.
+Functions marked with `BOOST_MATH_GPU_ENABLED` are tested using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+Functions marked with `BOOST_MATH_CUDA_ENABLED` are few in number; due to the restrictions of these functions, SYCL support is not available for them.
 
 [h4 Policies]
 
diff --git a/doc/sf/ellint_carlson.qbk b/doc/sf/ellint_carlson.qbk
index ca39cd6bef..db45697463 100644
--- a/doc/sf/ellint_carlson.qbk
+++ b/doc/sf/ellint_carlson.qbk
@@ -17,10 +17,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 namespace boost { namespace math {
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z)
 
    template <class T1, class T2, class T3, class Policy>
-   ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&)
 
 }} // namespaces
 
@@ -32,10 +32,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 namespace boost { namespace math {
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z)
 
    template <class T1, class T2, class T3, class Policy>
-   ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&)
 
 }} // namespaces
 
@@ -47,10 +47,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 namespace boost { namespace math {
 
    template <class T1, class T2, class T3, class T4>
-   ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p)
 
    template <class T1, class T2, class T3, class T4, class Policy>
-   ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&)
 
 }} // namespaces
 
@@ -62,10 +62,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 namespace boost { namespace math {
 
    template <class T1, class T2>
-   ``__sf_result`` ellint_rc(T1 x, T2 y)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y)
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&)
 
 }} // namespaces
 
@@ -76,10 +76,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 namespace boost { namespace math {
 
    template <class T1, class T2, class T3>
-   ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z)
 
    template <class T1, class T2, class T3, class Policy>
-   ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&)
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&)
 
 }} // namespaces
 
@@ -98,10 +98,10 @@ when the arguments are of different types:
 otherwise the return is the same type as the arguments.
template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rf(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's Elliptic Integral ['R[sub F]]: @@ -113,10 +113,10 @@ one may be zero. Otherwise returns the result of __domain_error. [optional_policy] template - ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rd(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's elliptic integral R[sub D]: @@ -128,10 +128,10 @@ zero, and that z >= 0. Otherwise returns the result of __domain_error. [optional_policy] template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p) template - ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rj(T1 x, T2 y, T3 z, T4 p, const ``__Policy``&) Returns Carlson's elliptic integral R[sub J]: @@ -149,10 +149,10 @@ using the relation: [equation ellint17] template - ``__sf_result`` ellint_rc(T1 x, T2 y) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y) template - ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rc(T1 x, T2 y, const ``__Policy``&) Returns Carlson's elliptic integral R[sub C]: @@ -170,10 +170,10 @@ using the relation: [equation ellint18] template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z) template - ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_rg(T1 x, T2 y, T3 z, const ``__Policy``&) Returns Carlson's elliptic integral ['R[sub G]:] diff --git a/doc/sf/ellint_legendre.qbk b/doc/sf/ellint_legendre.qbk index c780a9b019..50b633af9f 100644 --- a/doc/sf/ellint_legendre.qbk +++ b/doc/sf/ellint_legendre.qbk @@ -17,16 +17,16 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) namespace boost { namespace math { template - ``__sf_result`` ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi); template - ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); template - ``__sf_result`` ellint_1(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k); template - ``__sf_result`` ellint_1(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k, const ``__Policy``&); }} // namespaces @@ -42,10 +42,10 @@ when T1 and T2 are different types: when they are the same type then the result is the same type as the arguments. 
template - ``__sf_result`` ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi); template - ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T1 k, T2 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the first kind ['F([phi], k)]: @@ -56,10 +56,10 @@ Requires k[super 2]sin[super 2](phi) < 1, otherwise returns the result of __doma [optional_policy] template - ``__sf_result`` ellint_1(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k); template - ``__sf_result`` ellint_1(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_1(T k, const ``__Policy``&); Returns the complete elliptic integral of the first kind ['K(k)]: @@ -123,16 +123,16 @@ and namespace boost { namespace math { template - ``__sf_result`` ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi); template - ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); template - ``__sf_result`` ellint_2(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k); template - ``__sf_result`` ellint_2(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k, const ``__Policy``&); }} // namespaces @@ -148,10 +148,10 @@ when T1 and T2 are different types: when they are the same type then the result is the same type as the arguments. template - ``__sf_result`` ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi); template - ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T1 k, T2 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the second kind ['E([phi], k)]: @@ -162,10 +162,10 @@ Requires k[super 2]sin[super 2](phi) < 1, otherwise returns the result of __doma [optional_policy] template - ``__sf_result`` ellint_2(T k); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k); template - ``__sf_result`` ellint_2(T k, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_2(T k, const ``__Policy``&); Returns the complete elliptic integral of the second kind ['E(k)]: @@ -230,16 +230,16 @@ and namespace boost { namespace math { template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); template - ``__sf_result`` ellint_3(T1 k, T2 n); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n); template - ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&); }} // namespaces @@ -255,10 +255,10 @@ when the arguments are of different types: when they are the same type then the is the same type as the arguments. template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi); template - ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); + BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, T3 phi, const ``__Policy``&); Returns the incomplete elliptic integral of the third kind ['[Pi](n, [phi], k)]: @@ -271,10 +271,10 @@ would be complex). 
 [optional_policy]
 
    template <class T1, class T2>
-   ``__sf_result`` ellint_3(T1 k, T2 n);
+   BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n);
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&);
+   BOOST_MATH_CUDA_ENABLED ``__sf_result`` ellint_3(T1 k, T2 n, const ``__Policy``&);
 
 Returns the complete elliptic integral of the first kind ['[Pi](n, k)]:
 
@@ -355,16 +355,16 @@ and
 namespace boost { namespace math {
 
    template <class T1, class T2>
-   ``__sf_result`` ellint_d(T1 k, T2 phi);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi);
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` ellint_d(T1 k, T2 phi, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi, const ``__Policy``&);
 
    template <class T1>
-   ``__sf_result`` ellint_d(T1 k);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k);
 
    template <class T1, class Policy>
-   ``__sf_result`` ellint_d(T1 k, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, const ``__Policy``&);
 
 }} // namespaces
 
@@ -378,10 +378,10 @@ when the arguments are of different types: when they are the same type then the
 is the same type as the arguments.
 
    template <class T1, class T2>
-   ``__sf_result`` ellint_d(T1 k, T2 phi);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi);
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` ellint_3(T1 k, T2 phi, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, T2 phi, const ``__Policy``&);
 
 Returns the incomplete elliptic integral:
 
@@ -394,10 +394,10 @@ would be complex).
 [optional_policy]
 
    template <class T1>
-   ``__sf_result`` ellint_d(T1 k);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k);
 
    template <class T1, class Policy>
-   ``__sf_result`` ellint_d(T1 k, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` ellint_d(T1 k, const ``__Policy``&);
 
 Returns the complete elliptic integral ['D(k) = D([pi]/2, k)]
 
@@ -463,10 +463,10 @@ using the relation:
 namespace boost { namespace math {
 
    template <class T1, class T2>
-   ``__sf_result`` jacobi_zeta(T1 k, T2 phi);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` jacobi_zeta(T1 k, T2 phi);
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` jacobi_zeta(T1 k, T2 phi, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` jacobi_zeta(T1 k, T2 phi, const ``__Policy``&);
 
 }} // namespaces
 
@@ -543,10 +543,10 @@ is [@../../example/jacobi_zeta_example.cpp jacobi_zeta_example.cpp].
 namespace boost { namespace math {
 
    template <class T1, class T2>
-   ``__sf_result`` heuman_lambda(T1 k, T2 phi);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` heuman_lambda(T1 k, T2 phi);
 
    template <class T1, class T2, class Policy>
-   ``__sf_result`` heuman_lambda(T1 k, T2 phi, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` heuman_lambda(T1 k, T2 phi, const ``__Policy``&);
 
 }} // namespaces
 
diff --git a/include/boost/math/special_functions/airy.hpp b/include/boost/math/special_functions/airy.hpp
index 06eee92383..65114089a6 100644
--- a/include/boost/math/special_functions/airy.hpp
+++ b/include/boost/math/special_functions/airy.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2012.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt @@ -7,19 +8,24 @@ #ifndef BOOST_MATH_AIRY_HPP #define BOOST_MATH_AIRY_HPP -#include +#include +#include +#include +#include #include #include #include #include #include +#include +#include namespace boost{ namespace math{ namespace detail{ template -T airy_ai_imp(T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -57,7 +63,7 @@ T airy_ai_imp(T x, const Policy& pol) } template -T airy_bi_imp(T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -90,7 +96,7 @@ T airy_bi_imp(T x, const Policy& pol) } template -T airy_ai_prime_imp(T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -125,7 +131,7 @@ T airy_ai_prime_imp(T x, const Policy& pol) } template -T airy_bi_prime_imp(T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -156,7 +162,7 @@ T airy_bi_prime_imp(T x, const Policy& pol) } template -T airy_ai_zero_imp(int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_ai_zero_imp(int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt. @@ -209,7 +215,7 @@ T airy_ai_zero_imp(int m, const Policy& pol) } template -T airy_bi_zero_imp(int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED T airy_bi_zero_imp(int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt. @@ -263,7 +269,7 @@ T airy_bi_zero_imp(int m, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type airy_ai(T x, const Policy&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_ai(T x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename tools::promote_args::type result_type; @@ -279,13 +285,13 @@ inline typename tools::promote_args::type airy_ai(T x, const Policy&) } template -inline typename tools::promote_args::type airy_ai(T x) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_ai(T x) { return airy_ai(x, policies::policy<>()); } template -inline typename tools::promote_args::type airy_bi(T x, const Policy&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_bi(T x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename tools::promote_args::type result_type; @@ -301,13 +307,13 @@ inline typename tools::promote_args::type airy_bi(T x, const Policy&) } template -inline typename tools::promote_args::type airy_bi(T x) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_bi(T x) { return airy_bi(x, policies::policy<>()); } template -inline typename tools::promote_args::type airy_ai_prime(T x, const Policy&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_ai_prime(T x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename tools::promote_args::type result_type; @@ -323,13 +329,13 @@ inline typename tools::promote_args::type airy_ai_prime(T x, const Policy&) } template -inline typename tools::promote_args::type airy_ai_prime(T x) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_ai_prime(T x) { return airy_ai_prime(x, policies::policy<>()); } template -inline typename tools::promote_args::type airy_bi_prime(T x, const Policy&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_bi_prime(T x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD typedef typename tools::promote_args::type result_type; @@ 
-345,13 +351,13 @@ inline typename tools::promote_args::type airy_bi_prime(T x, const Policy&) } template -inline typename tools::promote_args::type airy_bi_prime(T x) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type airy_bi_prime(T x) { return airy_bi_prime(x, policies::policy<>()); } template -inline T airy_ai_zero(int m, const Policy& /*pol*/) +BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m, const Policy& /*pol*/) { BOOST_FPU_EXCEPTION_GUARD typedef typename policies::evaluation::type value_type; @@ -371,13 +377,13 @@ inline T airy_ai_zero(int m, const Policy& /*pol*/) } template -inline T airy_ai_zero(int m) +BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m) { return airy_ai_zero(m, policies::policy<>()); } template -inline OutputIterator airy_ai_zero( +BOOST_MATH_GPU_ENABLED inline OutputIterator airy_ai_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it, @@ -399,7 +405,7 @@ inline OutputIterator airy_ai_zero( } template -inline OutputIterator airy_ai_zero( +BOOST_MATH_GPU_ENABLED inline OutputIterator airy_ai_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it) @@ -408,7 +414,7 @@ inline OutputIterator airy_ai_zero( } template -inline T airy_bi_zero(int m, const Policy& /*pol*/) +BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m, const Policy& /*pol*/) { BOOST_FPU_EXCEPTION_GUARD typedef typename policies::evaluation::type value_type; @@ -428,13 +434,13 @@ inline T airy_bi_zero(int m, const Policy& /*pol*/) } template -inline T airy_bi_zero(int m) +BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m) { return airy_bi_zero(m, policies::policy<>()); } template -inline OutputIterator airy_bi_zero( +BOOST_MATH_GPU_ENABLED inline OutputIterator airy_bi_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it, @@ -456,7 +462,7 @@ inline OutputIterator airy_bi_zero( } template -inline OutputIterator airy_bi_zero( +BOOST_MATH_GPU_ENABLED inline OutputIterator airy_bi_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it) diff --git a/include/boost/math/special_functions/atanh.hpp b/include/boost/math/special_functions/atanh.hpp index 543fb5fce3..9d73e568c0 100644 --- a/include/boost/math/special_functions/atanh.hpp +++ b/include/boost/math/special_functions/atanh.hpp @@ -15,7 +15,7 @@ #pragma once #endif -#include +#include #include #include #include @@ -33,10 +33,10 @@ namespace boost // This is the main fare template - inline T atanh_imp(const T x, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline T atanh_imp(const T x, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::atanh<%1%>(%1%)"; + constexpr auto function = "boost::math::atanh<%1%>(%1%)"; if(x < -1) { @@ -87,7 +87,7 @@ namespace boost } template - inline typename tools::promote_args::type atanh(T x, const Policy&) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type atanh(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -102,7 +102,7 @@ namespace boost "boost::math::atanh<%1%>(%1%)"); } template - inline typename tools::promote_args::type atanh(T x) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type atanh(T x) { return boost::math::atanh(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/ellint_1.hpp b/include/boost/math/special_functions/ellint_1.hpp index f7fbbce40c..96c7c9e9b9 100644 --- a/include/boost/math/special_functions/ellint_1.hpp +++ 
b/include/boost/math/special_functions/ellint_1.hpp @@ -20,6 +20,7 @@ #endif #include +#include #include #include #include @@ -33,28 +34,28 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); namespace detail{ template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&); template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, T one_minus_k2); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, T one_minus_k2); // Elliptic integral (Legendre form) of the first kind template -T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) +BOOST_MATH_GPU_ENABLED T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - static const char* function = "boost::math::ellint_f<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::ellint_f<%1%>(%1%,%1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(phi); BOOST_MATH_INSTRUMENT_VARIABLE(k); BOOST_MATH_INSTRUMENT_VARIABLE(function); @@ -151,19 +152,19 @@ T ellint_f_imp(T phi, T k, const Policy& pol, T one_minus_k2) } template -inline T ellint_f_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T ellint_f_imp(T phi, T k, const Policy& pol) { return ellint_f_imp(phi, k, pol, T(1 - k * k)); } // Complete elliptic integral (Legendre form) of the first kind template -T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) +BOOST_MATH_GPU_ENABLED T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) { BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_k<%1%>(%1%)"; + constexpr auto function = "boost::math::ellint_k<%1%>(%1%)"; if (abs(k) > 1) { @@ -181,7 +182,7 @@ T ellint_k_imp(T k, const Policy& pol, T one_minus_k2) return value; } template -inline T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED inline T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { return ellint_k_imp(k, pol, T(1 - k * k)); } @@ -203,9 +204,9 @@ inline T ellint_k_imp(T k, const Policy& pol, std::integral_constant con // archived in the code below), but was found to have slightly higher error rates. 
// template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -456,7 +457,7 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons // This handles all cases where m > 0.9, // including all error handling: // - return ellint_k_imp(k, pol, std::integral_constant()); + return ellint_k_imp(k, pol, boost::math::integral_constant()); #if 0 else { @@ -476,9 +477,9 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons } } template -BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -757,44 +758,37 @@ BOOST_MATH_FORCEINLINE T ellint_k_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_k_imp(k, pol, std::integral_constant()); + return ellint_k_imp(k, pol, boost::math::integral_constant()); } } template -typename tools::promote_args::type ellint_1(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; - typedef std::integral_constant::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 #endif > precision_tag_type; return policies::checked_narrowing_cast(detail::ellint_k_imp(static_cast(k), pol, precision_tag_type()), "boost::math::ellint_1<%1%>(%1%)"); } template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_1(k, phi, policies::policy<>()); } -} - -// Complete elliptic integral (Legendre form) of the first kind -template -typename tools::promote_args::type ellint_1(T k) -{ - return ellint_1(k, policies::policy<>()); -} +} // namespace detail // Elliptic integral (Legendre form) of the first kind template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! 
{ typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -802,12 +796,19 @@ typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& } template -typename tools::promote_args::type ellint_1(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_1(k, phi, tag_type()); } +// Complete elliptic integral (Legendre form) of the first kind +template +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_1(T k) +{ + return ellint_1(k, policies::policy<>()); +} + }} // namespaces #endif // BOOST_MATH_ELLINT_1_HPP diff --git a/include/boost/math/special_functions/ellint_2.hpp b/include/boost/math/special_functions/ellint_2.hpp index 5e2552ceca..0cc1fa0944 100644 --- a/include/boost/math/special_functions/ellint_2.hpp +++ b/include/boost/math/special_functions/ellint_2.hpp @@ -19,6 +19,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -34,20 +37,20 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); namespace detail{ template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const boost::math::integral_constant&); template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const boost::math::integral_constant&); template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const std::integral_constant&); +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, const boost::math::integral_constant&); // Elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_e_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -72,9 +75,9 @@ T ellint_e_imp(T phi, T k, const Policy& pol) } else if(phi > 1 / tools::epsilon()) { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 > precision_tag_type; // Phi is so large that phi%pi is necessarily zero (or garbage), // just return the second part of the duplication formula: @@ -139,9 +142,9 @@ T ellint_e_imp(T phi, T k, const Policy& pol) } if (m != 0) { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 
0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 1 : 2 > precision_tag_type; result += m * ellint_e_imp(k, pol, precision_tag_type()); } @@ -151,7 +154,7 @@ T ellint_e_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -189,9 +192,9 @@ T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) // existing routines. // template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -424,13 +427,13 @@ BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_e_imp(k, pol, std::integral_constant()); + return ellint_e_imp(k, pol, boost::math::integral_constant()); } } template -BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_constant const&) +BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, boost::math::integral_constant const&) { - using std::abs; + BOOST_MATH_STD_USING using namespace boost::math::tools; T m = k * k; @@ -697,54 +700,56 @@ BOOST_MATH_FORCEINLINE T ellint_e_imp(T k, const Policy& pol, std::integral_cons // All cases where m > 0.9 // including all error handling: // - return ellint_e_imp(k, pol, std::integral_constant()); + return ellint_e_imp(k, pol, boost::math::integral_constant()); } } template -typename tools::promote_args::type ellint_2(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 + typedef boost::math::integral_constant::value&& boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 
1 : 2 > precision_tag_type; return policies::checked_narrowing_cast(detail::ellint_e_imp(static_cast(k), pol, precision_tag_type()), "boost::math::ellint_2<%1%>(%1%)"); } // Elliptic integral (Legendre form) of the second kind template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_2(k, phi, policies::policy<>()); } } // detail -// Complete elliptic integral (Legendre form) of the second kind -template -typename tools::promote_args::type ellint_2(T k) -{ - return ellint_2(k, policies::policy<>()); -} - // Elliptic integral (Legendre form) of the second kind template -typename tools::promote_args::type ellint_2(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_2(k, phi, tag_type()); } template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) // LCOV_EXCL_LINE gcc misses this but sees the function body, strange! { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; return policies::checked_narrowing_cast(detail::ellint_e_imp(static_cast(phi), static_cast(k), pol), "boost::math::ellint_2<%1%>(%1%,%1%)"); } + +// Complete elliptic integral (Legendre form) of the second kind +template +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_2(T k) +{ + return ellint_2(k, policies::policy<>()); +} + + }} // namespaces #endif // BOOST_MATH_ELLINT_2_HPP diff --git a/include/boost/math/special_functions/ellint_3.hpp b/include/boost/math/special_functions/ellint_3.hpp index 33acc545dc..b8df7e2645 100644 --- a/include/boost/math/special_functions/ellint_3.hpp +++ b/include/boost/math/special_functions/ellint_3.hpp @@ -18,6 +18,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -38,16 +40,16 @@ namespace boost { namespace math { namespace detail{ template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol); +BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol); // Elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) +BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) { // Note vc = 1-v presumably without cancellation error. 
BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; + constexpr auto function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; T sphi = sin(fabs(phi)); @@ -270,13 +272,13 @@ T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) // Complete elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol) +BOOST_MATH_CUDA_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) { // Note arg vc = 1-v, possibly without cancellation errors BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; if (abs(k) >= 1) { @@ -318,13 +320,13 @@ T ellint_pi_imp(T v, T k, T vc, const Policy& pol) } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const std::false_type&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const boost::math::false_type&) { return boost::math::ellint_3(k, v, phi, policies::policy<>()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const std::true_type&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -339,7 +341,7 @@ inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Pol } // namespace detail template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -354,14 +356,14 @@ inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 ph } template -typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) +BOOST_MATH_CUDA_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_3(k, v, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v) +BOOST_MATH_CUDA_ENABLED inline typename tools::promote_args::type ellint_3(T1 k, T2 v) { return ellint_3(k, v, policies::policy<>()); } diff --git a/include/boost/math/special_functions/ellint_d.hpp b/include/boost/math/special_functions/ellint_d.hpp index da1e87ba3e..f5a8491f5a 100644 --- a/include/boost/math/special_functions/ellint_d.hpp +++ b/include/boost/math/special_functions/ellint_d.hpp @@ -1,5 +1,6 @@ // Copyright (c) 2006 Xiaogang Zhang // Copyright (c) 2006 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -18,6 +19,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -33,16 +36,16 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); namespace detail{ template -T ellint_d_imp(T k, const Policy& pol); +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol); // Elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -113,7 +116,7 @@ T ellint_d_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -135,7 +138,7 @@ T ellint_d_imp(T k, const Policy& pol) } template -inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, const std::true_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, const boost::math::true_type&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -144,7 +147,7 @@ inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, co // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const boost::math::false_type&) { return boost::math::ellint_d(k, phi, policies::policy<>()); } @@ -153,21 +156,21 @@ inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const s // Complete elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T k) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T k) { return ellint_d(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_d(k, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; diff --git a/include/boost/math/special_functions/ellint_rc.hpp b/include/boost/math/special_functions/ellint_rc.hpp index 2f9a1f8cfb..ae3c6375e5 100644 --- a/include/boost/math/special_functions/ellint_rc.hpp +++ b/include/boost/math/special_functions/ellint_rc.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -18,12 +19,11 @@ #pragma once #endif -#include #include +#include #include #include #include -#include // Carlson's degenerate elliptic integral // R_C(x, y) = R_F(x, y, y) = 0.5 * \int_{0}^{\infty} (t+x)^{-1/2} (t+y)^{-1} dt @@ -32,11 +32,11 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rc_imp(T x, T y, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rc_imp(T x, T y, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_rc<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::ellint_rc<%1%>(%1%,%1%)"; if(x < 0) { @@ -88,7 +88,7 @@ T ellint_rc_imp(T x, T y, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rc(T1 x, T2 y, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -100,7 +100,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rc(T1 x, T2 y) { return ellint_rc(x, y, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rd.hpp b/include/boost/math/special_functions/ellint_rd.hpp index 2a79e54ca2..f2a33adc46 100644 --- a/include/boost/math/special_functions/ellint_rd.hpp +++ b/include/boost/math/special_functions/ellint_rd.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock. +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -16,10 +17,10 @@ #pragma once #endif +#include +#include #include #include -#include -#include #include // Carlson's elliptic integral of the second kind @@ -29,12 +30,11 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rd_imp(T x, T y, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rd_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING - using std::swap; - static const char* function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)"; + constexpr auto function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)"; if(x < 0) { @@ -55,9 +55,11 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) // // Special cases from http://dlmf.nist.gov/19.20#iv // - using std::swap; + if(x == z) - swap(x, y); + { + BOOST_MATH_GPU_SAFE_SWAP(x, y); + } if(y == z) { if(x == y) @@ -70,19 +72,21 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) } else { - if((std::max)(x, y) / (std::min)(x, y) > T(1.3)) + if(BOOST_MATH_GPU_SAFE_MAX(x, y) / BOOST_MATH_GPU_SAFE_MIN(x, y) > T(1.3)) return 3 * (ellint_rc_imp(x, y, pol) - sqrt(x) / y) / (2 * (y - x)); // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y) } } if(x == y) { - if((std::max)(x, z) / (std::min)(x, z) > T(1.3)) + if(BOOST_MATH_GPU_SAFE_MAX(x, z) / BOOST_MATH_GPU_SAFE_MIN(x, z) > T(1.3)) return 3 * (ellint_rc_imp(z, x, pol) - 1 / sqrt(z)) / (z - x); // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y) } if(y == 0) - swap(x, y); + { + BOOST_MATH_GPU_SAFE_SWAP(x, y); + } if(x == 0) { // @@ -102,7 +106,8 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) xn = (xn + yn) / 2; yn = t; sum_pow *= 2; - sum += sum_pow * boost::math::pow<2>(xn - yn); + const auto temp = (xn - 
yn); + sum += sum_pow * temp * temp; } T RF = constants::pi() / (xn + yn); // @@ -128,7 +133,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) T An = (x + y + 3 * z) / 5; T A0 = An; // This has an extra 1.2 fudge factor which is really only needed when x, y and z are close in magnitude: - T Q = pow(tools::epsilon() / 4, -T(1) / 8) * (std::max)((std::max)(An - x, An - y), An - z) * 1.2f; + T Q = pow(tools::epsilon() / 4, -T(1) / 8) * BOOST_MATH_GPU_SAFE_MAX(BOOST_MATH_GPU_SAFE_MAX(An - x, An - y), An - z) * 1.2f; BOOST_MATH_INSTRUMENT_VARIABLE(Q); T lambda, rx, ry, rz; unsigned k = 0; @@ -177,7 +182,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -190,7 +195,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z) { return ellint_rd(x, y, z, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rf.hpp b/include/boost/math/special_functions/ellint_rf.hpp index c781ac0353..eb1c2b6e71 100644 --- a/include/boost/math/special_functions/ellint_rf.hpp +++ b/include/boost/math/special_functions/ellint_rf.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -17,8 +18,9 @@ #pragma once #endif -#include #include +#include +#include #include #include #include @@ -30,21 +32,20 @@ namespace boost { namespace math { namespace detail{ template - T ellint_rf_imp(T x, T y, T z, const Policy& pol) + BOOST_MATH_GPU_ENABLED T ellint_rf_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math; - using std::swap; - static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; + constexpr auto function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; if(x < 0 || y < 0 || z < 0) { - return policies::raise_domain_error(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", std::numeric_limits::quiet_NaN(), pol); + return policies::raise_domain_error(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); } if(x + y == 0 || y + z == 0 || z + x == 0) { - return policies::raise_domain_error(function, "domain error, at most one argument can be zero, only sensible result is %1%.", std::numeric_limits::quiet_NaN(), pol); + return policies::raise_domain_error(function, "domain error, at most one argument can be zero, only sensible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); } // // Special cases from http://dlmf.nist.gov/19.20#i @@ -80,9 +81,9 @@ namespace boost { namespace math { namespace detail{ return ellint_rc_imp(x, y, pol); } if(x == 0) - swap(x, z); + BOOST_MATH_GPU_SAFE_SWAP(x, z); else if(y == 0) - swap(y, z); + BOOST_MATH_GPU_SAFE_SWAP(y, z); if(z == 0) { // @@ -105,7 +106,7 @@ namespace boost { namespace math { namespace detail{ T zn = z; T An = (x + y + z) / 3; T A0 = An; - T Q = pow(3 * boost::math::tools::epsilon(), T(-1) / 8) * (std::max)((std::max)(fabs(An - xn), 
fabs(An - yn)), fabs(An - zn));
 
       T fn = 1;
 
@@ -143,7 +144,7 @@ namespace boost { namespace math { namespace detail{
 } // namespace detail
 
 template <class T1, class T2, class T3, class Policy>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rf(T1 x, T2 y, T3 z, const Policy& pol)
 {
    typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -156,7 +157,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
 }
 
 template <class T1, class T2, class T3>
-inline typename tools::promote_args<T1, T2, T3>::type
+BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
    ellint_rf(T1 x, T2 y, T3 z)
 {
    return ellint_rf(x, y, z, policies::policy<>());
 
diff --git a/include/boost/math/special_functions/ellint_rg.hpp b/include/boost/math/special_functions/ellint_rg.hpp
index 051c104bca..8a7f706ac0 100644
--- a/include/boost/math/special_functions/ellint_rg.hpp
+++ b/include/boost/math/special_functions/ellint_rg.hpp
@@ -10,8 +10,8 @@
 #pragma once
 #endif
 
-#include
 #include
+#include
 #include
 #include
 #include
@@ -21,27 +21,26 @@ namespace boost { namespace math { namespace detail{
 
    template <class T, class Policy>
-   T ellint_rg_imp(T x, T y, T z, const Policy& pol)
+   BOOST_MATH_GPU_ENABLED T ellint_rg_imp(T x, T y, T z, const Policy& pol)
    {
       BOOST_MATH_STD_USING
 
-      static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)";
+      constexpr auto function = "boost::math::ellint_rg<%1%>(%1%,%1%,%1%)";
 
       if(x < 0 || y < 0 || z < 0)
       {
-         return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", std::numeric_limits<T>::quiet_NaN(), pol);
+         return policies::raise_domain_error<T>(function, "domain error, all arguments must be non-negative, only sensible result is %1%.", boost::math::numeric_limits<T>::quiet_NaN(), pol);
       }
       //
       // Function is symmetric in x, y and z, but we require
       // (x - z)(y - z) >= 0 to avoid cancellation error in the result
       // which implies (for example) x >= z >= y
       //
-      using std::swap;
       if(x < y)
-         swap(x, y);
+         BOOST_MATH_GPU_SAFE_SWAP(x, y);
       if(x < z)
-         swap(x, z);
+         BOOST_MATH_GPU_SAFE_SWAP(x, z);
       if(y > z)
-         swap(y, z);
+         BOOST_MATH_GPU_SAFE_SWAP(y, z);
 
       BOOST_MATH_ASSERT(x >= z);
       BOOST_MATH_ASSERT(z >= y);
@@ -64,7 +63,7 @@ namespace boost { namespace math { namespace detail{
       else
       {
          // x = z, y != 0
-         swap(x, y);
+         BOOST_MATH_GPU_SAFE_SWAP(x, y);
          return (x == 0) ?
T(sqrt(z) / 2) : T((z * ellint_rc_imp(x, z, pol) + sqrt(x)) / 2); } } @@ -75,7 +74,7 @@ namespace boost { namespace math { namespace detail{ } else if(y == 0) { - swap(y, z); + BOOST_MATH_GPU_SAFE_SWAP(y, z); // // Special handling for common case, from // Numerical Computation of Real or Complex Elliptic Integrals, eq.46 @@ -106,7 +105,7 @@ namespace boost { namespace math { namespace detail{ } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -119,7 +118,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z) { return ellint_rg(x, y, z, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rj.hpp b/include/boost/math/special_functions/ellint_rj.hpp index f19eac2843..76e1a14eb4 100644 --- a/include/boost/math/special_functions/ellint_rj.hpp +++ b/include/boost/math/special_functions/ellint_rj.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2006 Xiaogang Zhang, 2015 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -18,8 +19,9 @@ #pragma once #endif -#include #include +#include +#include #include #include #include @@ -32,7 +34,7 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rc1p_imp(T y, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rc1p_imp(T y, const Policy& pol) { using namespace boost::math; // Calculate RC(1, 1 + x) @@ -70,11 +72,11 @@ T ellint_rc1p_imp(T y, const Policy& pol) } template -T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ellint_rj_imp_final(T x, T y, T z, T p, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; + constexpr auto function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; if(x < 0) { @@ -94,37 +96,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) } if(x + y == 0 || y + z == 0 || z + x == 0) { - return policies::raise_domain_error(function, "At most one argument can be zero, only possible result is %1%.", std::numeric_limits::quiet_NaN(), pol); - } - - // for p < 0, the integral is singular, return Cauchy principal value - if(p < 0) - { - // - // We must ensure that x < y < z. 
- // Since the integral is symmetrical in x, y and z - // we can just permute the values: - // - if(x > y) - std::swap(x, y); - if(y > z) - std::swap(y, z); - if(x > y) - std::swap(x, y); - - BOOST_MATH_ASSERT(x <= y); - BOOST_MATH_ASSERT(y <= z); - - T q = -p; - p = (z * (x + y + q) - x * y) / (z + q); - - BOOST_MATH_ASSERT(p >= 0); - - T value = (p - z) * ellint_rj_imp(x, y, z, p, pol); - value -= 3 * ellint_rf_imp(x, y, z, pol); - value += 3 * sqrt((x * y * z) / (x * y + p * q)) * ellint_rc_imp(T(x * y + p * q), T(p * q), pol); - value /= (z + q); - return value; + return policies::raise_domain_error(function, "At most one argument can be zero, only possible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); } // @@ -148,13 +120,12 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) else { // x = y only, permute so y = z: - using std::swap; - swap(x, z); + BOOST_MATH_GPU_SAFE_SWAP(x, z); if(y == p) { return ellint_rd_imp(x, y, y, pol); } - else if((std::max)(y, p) / (std::min)(y, p) > T(1.2)) + else if(BOOST_MATH_GPU_SAFE_MAX(y, p) / BOOST_MATH_GPU_SAFE_MIN(y, p) > T(1.2)) { return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y); } @@ -168,7 +139,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) // y = z = p: return ellint_rd_imp(x, y, y, pol); } - else if((std::max)(y, p) / (std::min)(y, p) > T(1.2)) + else if(BOOST_MATH_GPU_SAFE_MAX(y, p) / BOOST_MATH_GPU_SAFE_MIN(y, p) > T(1.2)) { // y = z: return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y); @@ -187,7 +158,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) T An = (x + y + z + 2 * p) / 5; T A0 = An; T delta = (p - x) * (p - y) * (p - z); - T Q = pow(tools::epsilon() / 5, -T(1) / 8) * (std::max)((std::max)(fabs(An - x), fabs(An - y)), (std::max)(fabs(An - z), fabs(An - p))); + T Q = pow(tools::epsilon() / 5, -T(1) / 8) * BOOST_MATH_GPU_SAFE_MAX(BOOST_MATH_GPU_SAFE_MAX(fabs(An - x), fabs(An - y)), BOOST_MATH_GPU_SAFE_MAX(fabs(An - z), fabs(An - p))); unsigned n; T lambda; @@ -260,10 +231,71 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) return result; } +template +BOOST_MATH_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) +{ + BOOST_MATH_STD_USING + + constexpr auto function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; + + if(x < 0) + { + return policies::raise_domain_error(function, "Argument x must be non-negative, but got x = %1%", x, pol); + } + if(y < 0) + { + return policies::raise_domain_error(function, "Argument y must be non-negative, but got y = %1%", y, pol); + } + if(z < 0) + { + return policies::raise_domain_error(function, "Argument z must be non-negative, but got z = %1%", z, pol); + } + if(p == 0) + { + return policies::raise_domain_error(function, "Argument p must not be zero, but got p = %1%", p, pol); + } + if(x + y == 0 || y + z == 0 || z + x == 0) + { + return policies::raise_domain_error(function, "At most one argument can be zero, only possible result is %1%.", boost::math::numeric_limits::quiet_NaN(), pol); + } + + // for p < 0, the integral is singular, return Cauchy principal value + if(p < 0) + { + // + // We must ensure that x < y < z. 
+ // Since the integral is symmetrical in x, y and z + // we can just permute the values: + // + if(x > y) + BOOST_MATH_GPU_SAFE_SWAP(x, y); + if(y > z) + BOOST_MATH_GPU_SAFE_SWAP(y, z); + if(x > y) + BOOST_MATH_GPU_SAFE_SWAP(x, y); + + BOOST_MATH_ASSERT(x <= y); + BOOST_MATH_ASSERT(y <= z); + + T q = -p; + p = (z * (x + y + q) - x * y) / (z + q); + + BOOST_MATH_ASSERT(p >= 0); + + T value = (p - z) * ellint_rj_imp_final(x, y, z, p, pol); + value -= 3 * ellint_rf_imp(x, y, z, pol); + value += 3 * sqrt((x * y * z) / (x * y + p * q)) * ellint_rc_imp(T(x * y + p * q), T(p * q), pol); + value /= (z + q); + return value; + } + + return ellint_rj_imp_final(x, y, z, p, pol); +} + } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -278,7 +310,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p) { return ellint_rj(x, y, z, p, policies::policy<>()); diff --git a/include/boost/math/special_functions/heuman_lambda.hpp b/include/boost/math/special_functions/heuman_lambda.hpp index 0fbf4a9803..05002725f2 100644 --- a/include/boost/math/special_functions/heuman_lambda.hpp +++ b/include/boost/math/special_functions/heuman_lambda.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2015 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +11,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -26,13 +30,13 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T heuman_lambda_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T heuman_lambda_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - const char* function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; if(fabs(k) > 1) return policies::raise_domain_error(function, "We require |k| <= 1 but got k = %1%", k, pol); @@ -51,10 +55,10 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) } else { - typedef std::integral_constant::value&& std::numeric_limits::digits && (std::numeric_limits::digits <= 54) ? 0 : - std::is_floating_point::value && std::numeric_limits::digits && (std::numeric_limits::digits <= 64) ? 1 : 2 - > precision_tag_type; + typedef boost::math::integral_constant::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 54) ? 0 : + boost::math::is_floating_point::value && boost::math::numeric_limits::digits && (boost::math::numeric_limits::digits <= 64) ? 
1 : 2 + > precision_tag_type; T rkp = sqrt(kp); T ratio; @@ -63,7 +67,9 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) return policies::raise_domain_error(function, "When 1-k^2 == 1 then phi must be < Pi/2, but got phi = %1%", phi, pol); } else + { ratio = ellint_f_imp(phi, rkp, pol, k2) / ellint_k_imp(rkp, pol, k2); + } result = ratio + ellint_k_imp(k, pol, precision_tag_type()) * jacobi_zeta_imp(phi, rkp, pol, k2) / constants::half_pi(); } return result; @@ -72,7 +78,7 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) } // detail template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -80,7 +86,7 @@ inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, co } template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) { return boost::math::heuman_lambda(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/jacobi_zeta.hpp b/include/boost/math/special_functions/jacobi_zeta.hpp index c4ba7d23d2..8b6f80912d 100644 --- a/include/boost/math/special_functions/jacobi_zeta.hpp +++ b/include/boost/math/special_functions/jacobi_zeta.hpp @@ -11,6 +11,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -27,7 +29,7 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) +BOOST_MATH_GPU_ENABLED T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -55,14 +57,14 @@ T jacobi_zeta_imp(T phi, T k, const Policy& pol, T kp) return invert ? T(-result) : result; } template -inline T jacobi_zeta_imp(T phi, T k, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T jacobi_zeta_imp(T phi, T k, const Policy& pol) { return jacobi_zeta_imp(phi, k, pol, T(1 - k * k)); } } // detail template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -70,7 +72,7 @@ inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, cons } template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) { return boost::math::jacobi_zeta(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 22c3889499..35f8dd1bb9 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -26,6 +26,8 @@ #include #include // for argument promotion. 
+#include +#include #ifdef BOOST_MATH_HAS_NVRTC @@ -36,6 +38,20 @@ template BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type beta(RT1 a, RT2 b, A arg); +namespace detail{ + + template + struct ellint_3_result + { + using type = typename boost::math::conditional< + policies::is_policy::value, + tools::promote_args_t, + tools::promote_args_t + >::type; + }; + + } // namespace detail + } // namespace math } // namespace boost @@ -329,90 +345,90 @@ namespace boost // Elliptic integrals: template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rf(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rf(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rd(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rd(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rc(T1 x, T2 y); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rc(T1 x, T2 y, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rj(T1 x, T2 y, T3 z, T4 p); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rg(T1 x, T2 y, T3 z); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_rg(T1 x, T2 y, T3 z, const Policy& pol); template - tools::promote_args_t ellint_2(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T k); template - tools::promote_args_t ellint_2(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T1 k, T2 phi); template - tools::promote_args_t ellint_2(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_2(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t ellint_1(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_1(T k); template - tools::promote_args_t ellint_1(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_1(T1 k, T2 phi); template - tools::promote_args_t ellint_1(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_1(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t ellint_d(T k); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T k); template - tools::promote_args_t ellint_d(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T1 k, T2 phi); template - tools::promote_args_t ellint_d(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_d(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t jacobi_zeta(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t jacobi_zeta(T1 k, T2 phi); template - tools::promote_args_t jacobi_zeta(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t jacobi_zeta(T1 k, T2 phi, const Policy& pol); template - tools::promote_args_t heuman_lambda(T1 k, T2 phi); + BOOST_MATH_GPU_ENABLED tools::promote_args_t heuman_lambda(T1 k, T2 phi); template - tools::promote_args_t heuman_lambda(T1 k, T2 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t heuman_lambda(T1 k, T2 phi, const Policy& pol); namespace detail{ template struct 
ellint_3_result { - using type = typename std::conditional< + using type = typename boost::math::conditional< policies::is_policy::value, tools::promote_args_t, tools::promote_args_t @@ -423,13 +439,13 @@ namespace boost template - typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); + BOOST_MATH_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); template - tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); template - tools::promote_args_t ellint_3(T1 k, T2 v); + BOOST_MATH_GPU_ENABLED tools::promote_args_t ellint_3(T1 k, T2 v); // Factorial functions. // Note: not for integral types, at present. @@ -648,10 +664,10 @@ namespace boost tools::promote_args_t acosh(T x, const Policy&); template - tools::promote_args_t atanh(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t atanh(T x); template - tools::promote_args_t atanh(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t atanh(T x, const Policy&); namespace detail{ @@ -816,58 +832,58 @@ namespace boost std::complex >::result_type> sph_hankel_2(T1 v, T2 x); template - tools::promote_args_t airy_ai(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai(T x, const Policy&); template - tools::promote_args_t airy_ai(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai(T x); template - tools::promote_args_t airy_bi(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_bi(T x, const Policy&); template - tools::promote_args_t airy_bi(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_bi(T x); template - tools::promote_args_t airy_ai_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai_prime(T x, const Policy&); template - tools::promote_args_t airy_ai_prime(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_ai_prime(T x); template - tools::promote_args_t airy_bi_prime(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_bi_prime(T x, const Policy&); template - tools::promote_args_t airy_bi_prime(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t airy_bi_prime(T x); template - T airy_ai_zero(int m); + BOOST_MATH_GPU_ENABLED T airy_ai_zero(int m); template - T airy_ai_zero(int m, const Policy&); + BOOST_MATH_GPU_ENABLED T airy_ai_zero(int m, const Policy&); template - OutputIterator airy_ai_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator airy_ai_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy&); template - T airy_bi_zero(int m); + BOOST_MATH_GPU_ENABLED T airy_bi_zero(int m); template - T airy_bi_zero(int m, const Policy&); + BOOST_MATH_GPU_ENABLED T airy_bi_zero(int m, const Policy&); template - OutputIterator airy_bi_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator airy_bi_zero( + BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero( int start_index, unsigned number_of_zeros, OutputIterator out_it, @@ -1365,54 +1381,54 @@ namespace boost spherical_harmonic_i(unsigned n, int m, T1 theta, T2 phi, const Policy& pol);\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rf(T1 x, T2 y, T3 
z){ return ::boost::math::ellint_rf(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rd(T1 x, T2 y, T3 z){ return ::boost::math::ellint_rd(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rc(T1 x, T2 y){ return ::boost::math::ellint_rc(x, y, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rj(T1 x, T2 y, T3 z, T4 p){ return boost::math::ellint_rj(x, y, z, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ellint_rg(T1 x, T2 y, T3 z){ return ::boost::math::ellint_rg(x, y, z, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_2(T k){ return boost::math::ellint_2(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_2(T k){ return boost::math::ellint_2(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_2(T1 k, T2 phi){ return boost::math::ellint_2(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_2(T1 k, T2 phi){ return boost::math::ellint_2(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_d(T k){ return boost::math::ellint_d(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_d(T k){ return boost::math::ellint_d(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_d(T1 k, T2 phi){ return boost::math::ellint_d(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_d(T1 k, T2 phi){ return boost::math::ellint_d(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t jacobi_zeta(T1 k, T2 phi){ return boost::math::jacobi_zeta(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t heuman_lambda(T1 k, T2 phi){ return boost::math::heuman_lambda(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_1(T k){ return boost::math::ellint_1(k, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_1(T k){ return boost::math::ellint_1(k, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_1(T1 k, T2 phi){ return boost::math::ellint_1(k, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_1(T1 k, T2 phi){ return boost::math::ellint_1(k, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi){ return boost::math::ellint_3(k, v, phi, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v, T3 phi){ return boost::math::ellint_3(k, v, phi, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v){ return boost::math::ellint_3(k, v, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t ellint_3(T1 k, T2 v){ 
return boost::math::ellint_3(k, v, Policy()); }\ \ using boost::math::max_factorial;\ template \ @@ -1515,7 +1531,7 @@ namespace boost inline boost::math::tools::promote_args_t acosh(const T x){ return boost::math::acosh(x, Policy()); }\ \ template\ - inline boost::math::tools::promote_args_t atanh(const T x){ return boost::math::atanh(x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t atanh(const T x){ return boost::math::atanh(x, Policy()); }\ \ template \ inline typename boost::math::detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x)\ @@ -1769,33 +1785,33 @@ template \ { return boost::math::jacobi_theta4m1tau(z, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t airy_ai(T x)\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t airy_ai(T x)\ { return boost::math::airy_ai(x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t airy_bi(T x)\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t airy_bi(T x)\ { return boost::math::airy_bi(x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t airy_ai_prime(T x)\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t airy_ai_prime(T x)\ { return boost::math::airy_ai_prime(x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t airy_bi_prime(T x)\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t airy_bi_prime(T x)\ { return boost::math::airy_bi_prime(x, Policy()); }\ \ template \ - inline T airy_ai_zero(int m)\ + BOOST_MATH_GPU_ENABLED inline T airy_ai_zero(int m)\ { return boost::math::airy_ai_zero(m, Policy()); }\ template \ - OutputIterator airy_ai_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\ + BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\ { return boost::math::airy_ai_zero(start_index, number_of_zeros, out_it, Policy()); }\ \ template \ - inline T airy_bi_zero(int m)\ + BOOST_MATH_GPU_ENABLED inline T airy_bi_zero(int m)\ { return boost::math::airy_bi_zero(m, Policy()); }\ template \ - OutputIterator airy_bi_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\ + BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(int start_index, unsigned number_of_zeros, OutputIterator out_it)\ { return boost::math::airy_bi_zero(start_index, number_of_zeros, out_it, Policy()); }\ \ template \ diff --git a/include/boost/math/tools/workaround.hpp b/include/boost/math/tools/workaround.hpp index 9b15c4e930..7edd1c12aa 100644 --- a/include/boost/math/tools/workaround.hpp +++ b/include/boost/math/tools/workaround.hpp @@ -23,7 +23,7 @@ namespace boost{ namespace math{ namespace tools{ // std::fmod(1185.0L, 1.5L); // template -inline T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED inline T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) { BOOST_MATH_STD_USING return fmod(a, b); diff --git a/test/cuda_jamfile b/test/cuda_jamfile index b3e1992c63..0ba47dace3 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -244,6 +244,15 @@ run test_weibull_quan_double.cu ; run test_weibull_quan_float.cu ; # Special Functions +run test_airy_ai_double.cu ; +run test_airy_ai_float.cu ; +run test_airy_ai_prime_double.cu ; +run test_airy_ai_prime_float.cu ; +run test_airy_bi_double.cu ; +run test_airy_bi_float.cu ; +run test_airy_bi_prime_double.cu ; +run test_airy_bi_prime_float.cu ; + run test_beta_double.cu ; run test_beta_float.cu ; run 
test_betac_double.cu ; @@ -308,6 +317,19 @@ run test_cos_pi_float.cu ; run test_digamma_double.cu ; run test_digamma_float.cu ; +run test_ellint_1_double.cu ; +run test_ellint_1_float.cu ; +run test_ellint_2_double.cu ; +run test_ellint_2_float.cu ; +run test_ellint_3_double.cu ; +run test_ellint_3_float.cu ; +run test_ellint_d_double.cu ; +run test_ellint_d_float.cu ; +run test_jacobi_zeta_double.cu ; +run test_jacobi_zeta_float.cu ; +run test_heuman_lambda_double.cu ; +run test_heuman_lambda_float.cu ; + run test_erf_double.cu ; run test_erf_float.cu ; run test_erf_inv_double.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 46dd0e257f..eb5da37c83 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -9,6 +9,9 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +run test_heumann_lambda_nvrtc_double.cpp ; +run test_heumann_lambda_nvrtc_float.cpp ; + # Quad run test_exp_sinh_quad_nvrtc_float.cpp ; run test_exp_sinh_quad_nvrtc_double.cpp ; @@ -241,6 +244,15 @@ run test_weibull_quan_nvrtc_double.cpp ; run test_weibull_quan_nvrtc_float.cpp ; # Special Functions +run test_airy_ai_nvrtc_double.cpp ; +run test_airy_ai_nvrtc_float.cpp ; +run test_airy_ai_prime_nvrtc_double.cpp ; +run test_airy_ai_prime_nvrtc_float.cpp ; +run test_airy_bi_nvrtc_double.cpp ; +run test_airy_bi_nvrtc_float.cpp ; +run test_airy_bi_prime_nvrtc_double.cpp ; +run test_airy_bi_prime_nvrtc_float.cpp ; + run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; run test_betac_nvrtc_double.cpp ; @@ -313,6 +325,17 @@ run test_erf_inv_nvrtc_float.cpp ; run test_erfc_inv_nvrtc_double.cpp ; run test_erfc_inv_nvrtc_float.cpp ; +run test_ellint_1_nvrtc_double.cpp ; +run test_ellint_1_nvrtc_float.cpp ; +run test_ellint_2_nvrtc_double.cpp ; +run test_ellint_2_nvrtc_float.cpp ; +run test_ellint_3_nvrtc_double.cpp ; +run test_ellint_3_nvrtc_float.cpp ; +run test_ellint_d_nvrtc_double.cpp ; +run test_ellint_d_nvrtc_float.cpp ; +run test_jacobi_zeta_nvrtc_double.cpp ; +run test_jacobi_zeta_nvrtc_float.cpp ; + run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 1626031082..ff6d84fde5 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -46,6 +46,8 @@ run test_weibull.cpp ; # Special Functions run pow_test.cpp ; +run test_airy.cpp ; + run test_beta_simple.cpp ; run test_beta.cpp ; run test_ibeta.cpp ; @@ -59,6 +61,12 @@ run test_bessel_y.cpp ; run test_cbrt.cpp ; +run test_ellint_1.cpp ; +run test_ellint_2.cpp ; +run test_ellint_d.cpp ; +run test_jacobi_zeta.cpp ; +run test_heuman_lambda.cpp ; + run test_sign.cpp ; run test_round.cpp ; diff --git a/test/test_airy.cpp b/test/test_airy.cpp index d42fbb4ca3..548d4de159 100644 --- a/test/test_airy.cpp +++ b/test/test_airy.cpp @@ -3,14 +3,21 @@ // Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
#include
+#endif
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_TEST_MAIN
#include
#include
#include
+#include
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include
+#endif
+
#include
#include
#include
@@ -50,6 +57,11 @@ void test_airy(T, const char* name)
   T tol = boost::math::tools::epsilon<T>() * 800;
   if ((std::numeric_limits<T>::digits > 100) || (std::numeric_limits<T>::digits == 0))
      tol *= 2;
+
+   #ifdef SYCL_LANGUAGE_VERSION
+   tol *= 5;
+   #endif
+
   for(unsigned i = 0; i < data.size(); ++i)
   {
      BOOST_CHECK_CLOSE_FRACTION(data[i][1], boost::math::airy_ai(data[i][0]), tol);
diff --git a/test/test_airy_ai_double.cu b/test/test_airy_ai_double.cu
new file mode 100644
index 0000000000..fad46bd9d5
--- /dev/null
+++ b/test/test_airy_ai_double.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/airy.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_ai_float.cu b/test/test_airy_ai_float.cu
new file mode 100644
index 0000000000..b9149aec39
--- /dev/null
+++ b/test/test_airy_ai_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/airy.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_ai_nvrtc_double.cpp b/test/test_airy_ai_nvrtc_double.cpp
new file mode 100644
index 0000000000..1b918cfef2
--- /dev/null
+++ b/test/test_airy_ai_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_nvrtc_float.cpp b/test/test_airy_ai_nvrtc_float.cpp new file mode 100644 index 0000000000..6957306426 --- /dev/null +++ b/test/test_airy_ai_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_prime_double.cu b/test/test_airy_ai_prime_double.cu new file mode 100644 index 0000000000..1a6bcd7104 --- /dev/null +++ b/test/test_airy_ai_prime_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/airy.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_ai_prime_float.cu b/test/test_airy_ai_prime_float.cu
new file mode 100644
index 0000000000..df690c2b10
--- /dev/null
+++ b/test/test_airy_ai_prime_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/airy.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_ai_prime(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_ai_prime(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_ai_prime_nvrtc_double.cpp b/test/test_airy_ai_prime_nvrtc_double.cpp
new file mode 100644
index 0000000000..1012571761
--- /dev/null
+++ b/test/test_airy_ai_prime_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_ai_prime_nvrtc_float.cpp b/test/test_airy_ai_prime_nvrtc_float.cpp new file mode 100644 index 0000000000..c96e044497 --- /dev/null +++ b/test/test_airy_ai_prime_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_ai_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_ai_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_ai_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_ai_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_ai_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
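// The verification loop below only compares results when the host-side
+ // reference value is finite; with BOOST_MATH_OVERFLOW_ERROR_POLICY set
+ // to ignore_error above, out-of-range arguments can yield infinities
+ // instead of throwing.
+ 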
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_ai_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_double.cu b/test/test_airy_bi_double.cu new file mode 100644 index 0000000000..60001a3fe5 --- /dev/null +++ b/test/test_airy_bi_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/airy.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the airy_bi CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::airy_bi(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_airy_bi_float.cu b/test/test_airy_bi_float.cu
new file mode 100644
index 0000000000..ed729bfe78
--- /dev/null
+++ b/test/test_airy_bi_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::airy_bi(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::airy_bi(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_airy_bi_nvrtc_double.cpp b/test/test_airy_bi_nvrtc_double.cpp new file mode 100644 index 0000000000..f69e239163 --- /dev/null +++ b/test/test_airy_bi_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/airy.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/airy.hpp>
+extern "C" __global__
+void test_airy_bi_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::airy_bi(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_airy_bi_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] =
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_nvrtc_float.cpp b/test/test_airy_bi_nvrtc_float.cpp new file mode 100644 index 0000000000..c28a5f5eb0 --- /dev/null +++ b/test/test_airy_bi_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_prime_double.cu b/test/test_airy_bi_prime_double.cu new file mode 100644 index 0000000000..a73e43f254 --- /dev/null +++ b/test/test_airy_bi_prime_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::airy_bi_prime(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_airy_bi_prime_float.cu b/test/test_airy_bi_prime_float.cu new file mode 100644 index 0000000000..36874bccc7 --- /dev/null +++ b/test/test_airy_bi_prime_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::airy_bi_prime(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_airy_bi_prime_nvrtc_double.cpp b/test/test_airy_bi_prime_nvrtc_double.cpp new file mode 100644 index 0000000000..802f63a292 --- /dev/null +++ b/test/test_airy_bi_prime_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
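// Bi'(x) grows roughly like exp((2/3)x^(3/2)), so most draws from
+ // (0, 1000) overflow; under the ignore_error policy above these come
+ // back as infinities, which the std::isfinite check in the verification
+ // loop below then skips.
+ 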
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_airy_bi_prime_nvrtc_float.cpp b/test/test_airy_bi_prime_nvrtc_float.cpp new file mode 100644 index 0000000000..e96aa48b97 --- /dev/null +++ b/test/test_airy_bi_prime_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_airy_bi_prime_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::airy_bi_prime(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_airy_bi_prime_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_airy_bi_prime_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_airy_bi_prime_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
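// h_in2/d_in2 are allocated and copied even though this kernel ignores
+ // its second argument; keeping the dead buffer appears to be deliberate,
+ // so every NVRTC test can reuse the same harness and kernel signature.
+ 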
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::airy_bi_prime(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_1.cpp b/test/test_ellint_1.cpp index b5cb2a359e..db82132a5a 100644 --- a/test/test_ellint_1.cpp +++ b/test/test_ellint_1.cpp @@ -6,7 +6,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_ellint_1.hpp" // diff --git a/test/test_ellint_1.hpp b/test/test_ellint_1.hpp index 635bcf2293..786841302c 100644 --- a/test/test_ellint_1.hpp +++ b/test/test_ellint_1.hpp @@ -9,11 +9,15 @@ // Constants are too big for float case, but this doesn't matter for test. #endif +#include +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif #define BOOST_TEST_MAIN #include #include #include +#include #include #include "functor.hpp" @@ -139,11 +143,13 @@ void test_spots(T, const char* type_name) // // Test error handling: // + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_GE(boost::math::ellint_1(T(1)), boost::math::tools::max_value()); BOOST_CHECK_GE(boost::math::ellint_1(T(-1)), boost::math::tools::max_value()); BOOST_CHECK_THROW(boost::math::ellint_1(T(1.0001)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_1(T(-1.0001)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_1(T(2.2), T(0.5)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_1(T(-2.2), T(0.5)), std::domain_error); + #endif } diff --git a/test/test_ellint_1_double.cu b/test/test_ellint_1_double.cu new file mode 100644 index 0000000000..eb9bfb162d --- /dev/null +++ b/test/test_ellint_1_double.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. 
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/ellint_1.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <vector>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    std::mt19937_64 rng(42);
+    std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the ellint_1 CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ellint_1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_1_float.cu b/test/test_ellint_1_float.cu
new file mode 100644
index 0000000000..8de959d225
--- /dev/null
+++ b/test/test_ellint_1_float.cu
@@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_1(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_1(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_ellint_1_nvrtc_double.cpp b/test/test_ellint_1_nvrtc_double.cpp new file mode 100644 index 0000000000..fac5da55f0 --- /dev/null +++ b/test/test_ellint_1_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_1_nvrtc_float.cpp b/test/test_ellint_1_nvrtc_float.cpp new file mode 100644 index 0000000000..fac5da55f0 --- /dev/null +++ b/test/test_ellint_1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/ellint_1.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/ellint_1.hpp>
+extern "C" __global__
+void test_ellint_1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_1(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ellint_1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] =
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_2.cpp b/test/test_ellint_2.cpp index ca3e994d4d..0da012c133 100644 --- a/test/test_ellint_2.cpp +++ b/test/test_ellint_2.cpp @@ -6,7 +6,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_ellint_2.hpp" // @@ -72,7 +75,11 @@ void expected_results() ".*", // platform largest_type, // test type(s) ".*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + ".*", 20, 6); // test function + #else ".*", 15, 6); // test function + #endif add_expected_result( ".*", // compiler ".*", // stdlib diff --git a/test/test_ellint_2.hpp b/test/test_ellint_2.hpp index e38f94d984..29a73c9961 100644 --- a/test/test_ellint_2.hpp +++ b/test/test_ellint_2.hpp @@ -9,11 +9,18 @@ // Constants are too big for float case, but this doesn't matter for test. 
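The test_ellint_2.cpp hunk above shows the two host-side accommodations this patch makes for SYCL builds: the precompiled header is skipped, and the expected max error for the largest tested type is widened from 15 to 20 eps. A minimal sketch of that pattern follows; the pch header name and the abbreviated add_expected_result argument list are assumptions reconstructed around the values visible in the hunk, not verbatim patch content:

    // Assumed header name; SYCL builds skip the precompiled header.
    #ifndef SYCL_LANGUAGE_VERSION
    #include <pch_light.hpp>
    #endif

    // Widen the allowed max error only for the SYCL backend
    // (20 vs. 15 eps, the values from the hunk above).
    #ifdef SYCL_LANGUAGE_VERSION
       add_expected_result(".*", ".*", ".*", largest_type, ".*", ".*", 20, 6);
    #else
       add_expected_result(".*", ".*", ".*", largest_type, ".*", ".*", 15, 6);
    #endif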
#endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include +#include #include #include "functor.hpp" @@ -157,10 +164,12 @@ void test_spots(T, const char* type_name) // // Test error handling: // + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_EQUAL(boost::math::ellint_2(T(1)), T(1)); BOOST_CHECK_EQUAL(boost::math::ellint_2(T(-1)), T(1)); BOOST_CHECK_THROW(boost::math::ellint_2(T(1.5)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_2(T(-1.5)), std::domain_error); BOOST_CHECK_THROW(boost::math::ellint_2(T(1.5), T(1.5)), std::domain_error); + #endif } diff --git a/test/test_ellint_2_double.cu b/test/test_ellint_2_double.cu new file mode 100644 index 0000000000..2e1073576e --- /dev/null +++ b/test/test_ellint_2_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_2(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_2(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_ellint_2_float.cu b/test/test_ellint_2_float.cu new file mode 100644 index 0000000000..a55a6d1ad4 --- /dev/null +++ b/test/test_ellint_2_float.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_2(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_2(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_ellint_2_nvrtc_double.cpp b/test/test_ellint_2_nvrtc_double.cpp new file mode 100644 index 0000000000..dd2eef1547 --- /dev/null +++ b/test/test_ellint_2_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_2_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_2(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_2_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_2_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_2_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_2(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_2_nvrtc_float.cpp b/test/test_ellint_2_nvrtc_float.cpp new file mode 100644 index 0000000000..dd2eef1547 --- /dev/null +++ b/test/test_ellint_2_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/ellint_2.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/ellint_2.hpp>
+extern "C" __global__
+void test_ellint_2_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_2(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_2_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ellint_2_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_2_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] =
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_2(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_3_double.cu b/test/test_ellint_3_double.cu new file mode 100644 index 0000000000..979e01ff18 --- /dev/null +++ b/test/test_ellint_3_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_ellint_3_float.cu b/test/test_ellint_3_float.cu new file mode 100644 index 0000000000..979e01ff18 --- /dev/null +++ b/test/test_ellint_3_float.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/ellint_3.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_3(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_3_nvrtc_double.cpp b/test/test_ellint_3_nvrtc_double.cpp
new file mode 100644
index 0000000000..dacab66192
--- /dev/null
+++ b/test/test_ellint_3_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ellint_3_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_3_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_3_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_3_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_3(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_3_nvrtc_float.cpp b/test/test_ellint_3_nvrtc_float.cpp new file mode 100644 index 0000000000..72b2ec71e7 --- /dev/null +++ b/test/test_ellint_3_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ellint_3_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_3_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_3_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_3_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_3(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_d.cpp b/test/test_ellint_d.cpp index 5e76a49fb6..420bc0c022 100644 --- a/test/test_ellint_d.cpp +++ b/test/test_ellint_d.cpp @@ -4,7 +4,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_ellint_d.hpp" // diff --git a/test/test_ellint_d.hpp b/test/test_ellint_d.hpp index de53936f1f..c33a4d942a 100644 --- a/test/test_ellint_d.hpp +++ b/test/test_ellint_d.hpp @@ -8,11 +8,17 @@ // Constants are too big for float case, but this doesn't matter for test. 
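The test_ellint_d.hpp hunk that follows applies the same guard the other test headers use for error handling: the domain-error checks compile only when exceptions are available, so the guard must be #ifndef, not #ifdef. A minimal sketch of the intended pattern, using names from this patch:

    #ifndef BOOST_MATH_NO_EXCEPTIONS
    BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1.5)), std::domain_error);
    #endif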
 #endif
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/ellint_d.hpp>
+#include <boost/math/constants/constants.hpp>
 #include <array>
 #include "functor.hpp"
@@ -117,6 +123,7 @@ void test_spots(T, const char* type_name)
 
     do_test_ellint_d1(ellint_d_data, type_name, "Elliptic Integral D: Random Data");
 
+    #ifndef BOOST_MATH_NO_EXCEPTIONS
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1)), std::domain_error);
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(-1)), std::domain_error);
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1.5)), std::domain_error);
@@ -126,5 +133,6 @@ void test_spots(T, const char* type_name)
     BOOST_CHECK_EQUAL(boost::math::ellint_d(T(0.5), std::numeric_limits<T>::infinity()), std::numeric_limits<T>::infinity());
     }
     BOOST_MATH_CHECK_THROW(boost::math::ellint_d(T(1.5), T(1.0)), std::domain_error);
+    #endif
 }
diff --git a/test/test_ellint_d_double.cu b/test/test_ellint_d_double.cu
new file mode 100644
index 0000000000..979e01ff18
--- /dev/null
+++ b/test/test_ellint_d_double.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/ellint_d.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_d(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_d(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_d_float.cu b/test/test_ellint_d_float.cu
new file mode 100644
index 0000000000..50882aa76a
--- /dev/null
+++ b/test/test_ellint_d_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/ellint_d.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_d(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(boost::math::ellint_d(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_ellint_d_nvrtc_double.cpp b/test/test_ellint_d_nvrtc_double.cpp
new file mode 100644
index 0000000000..cb65a2e731
--- /dev/null
+++ b/test/test_ellint_d_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/ellint_d.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/ellint_d.hpp>
+extern "C" __global__
+void test_ellint_d_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ellint_d(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_d_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ellint_d_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
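+            // NVRTC reports compiler diagnostics via a two-step query:
+            // fetch the log size first, then copy the log text into a
+            // heap buffer, which is printed and freed below.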
nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_d_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_d(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ellint_d_nvrtc_float.cpp b/test/test_ellint_d_nvrtc_float.cpp new file mode 100644 index 0000000000..727d9dcd17 --- /dev/null +++ b/test/test_ellint_d_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ellint_d_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ellint_d(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ellint_d_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ellint_d_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ellint_d_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ellint_d(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_heuman_lambda.cpp b/test/test_heuman_lambda.cpp index 83709c635b..cdcf39aa68 100644 --- a/test/test_heuman_lambda.cpp +++ b/test/test_heuman_lambda.cpp @@ -4,7 +4,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_heuman_lambda.hpp" // diff --git a/test/test_heuman_lambda.hpp b/test/test_heuman_lambda.hpp index 23720b2d02..6081dac482 100644 --- a/test/test_heuman_lambda.hpp +++ b/test/test_heuman_lambda.hpp @@ -8,11 +8,17 @@ // Constants are too big for float case, but this doesn't matter for test. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include +#endif + #define BOOST_TEST_MAIN #include #include #include +#include #include #include #include "functor.hpp" diff --git a/test/test_heuman_lambda_double.cu b/test/test_heuman_lambda_double.cu new file mode 100644 index 0000000000..361dbe8051 --- /dev/null +++ b/test/test_heuman_lambda_double.cu @@ -0,0 +1,120 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
diff --git a/test/test_heuman_lambda_double.cu b/test/test_heuman_lambda_double.cu
new file mode 100644
index 0000000000..361dbe8051
--- /dev/null
+++ b/test/test_heuman_lambda_double.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/heuman_lambda.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_heuman_lambda_float.cu b/test/test_heuman_lambda_float.cu
new file mode 100644
index 0000000000..361dbe8051
--- /dev/null
+++ b/test/test_heuman_lambda_float.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/heuman_lambda.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
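The .cu tests above (and the jacobi_zeta ones below) lean on the repository's cuda_managed_ptr helper so the same buffer can be written on the host, read inside the kernel, and read back on the host with no explicit copies. The helper's definition is not part of this patch; the behaviour the tests rely on can be approximated by the following sketch, which is an illustration of the idea rather than the actual cuda_managed_ptr.hpp implementation.

    #include <cstddef>
    #include <cuda_runtime.h>

    // RAII array in CUDA unified (managed) memory: one pointer that is
    // valid on both host and device, freed automatically on scope exit.
    template <typename T>
    class managed_array
    {
        T* ptr_ = nullptr;
    public:
        explicit managed_array(std::size_t n) { cudaMallocManaged(&ptr_, n * sizeof(T)); }
        ~managed_array() { cudaFree(ptr_); }
        managed_array(const managed_array&) = delete;
        managed_array& operator=(const managed_array&) = delete;

        T* get() const { return ptr_; }                   // hand to a kernel
        T& operator[](std::size_t i) { return ptr_[i]; }  // index on the host
    };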
diff --git a/test/test_heumann_lambda_nvrtc_double.cpp b/test/test_heumann_lambda_nvrtc_double.cpp
new file mode 100644
index 0000000000..38c762fd51
--- /dev/null
+++ b/test/test_heumann_lambda_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/heuman_lambda.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/heuman_lambda.hpp>
+extern "C" __global__
+void test_heuman_lambda_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_heuman_lambda_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_heuman_lambda_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_heuman_lambda_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::heuman_lambda(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_heumann_lambda_nvrtc_float.cpp b/test/test_heumann_lambda_nvrtc_float.cpp
new file mode 100644
index 0000000000..5139b9d6f6
--- /dev/null
+++ b/test/test_heumann_lambda_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/heuman_lambda.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/heuman_lambda.hpp>
+extern "C" __global__
+void test_heuman_lambda_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_heuman_lambda_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_heuman_lambda_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_heuman_lambda_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::heuman_lambda(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
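Note how both harnesses verify results: a device value only has to land within a few hundred machine epsilons of the serial value (200 in the .cu tests, 300 in the NVRTC ones), and non-finite serial results are skipped, since the GPU may contract or reorder floating-point operations and need not match the host bit for bit. A small self-contained example of what that comparison measures, assuming only the documented behaviour of epsilon_difference (the gap between two values expressed in units of machine epsilon):

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <iostream>
    #include <limits>

    int main()
    {
        double serial   = 1.0;
        double parallel = 1.0 + 64 * std::numeric_limits<double>::epsilon();

        // Prints roughly 64: comfortably inside the 200-300 epsilon
        // budget these tests allow between device and host results.
        std::cout << boost::math::epsilon_difference(serial, parallel) << std::endl;
    }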
diff --git a/test/test_jacobi_zeta.cpp b/test/test_jacobi_zeta.cpp
index 77f33efb1e..c64f99580e 100644
--- a/test/test_jacobi_zeta.cpp
+++ b/test/test_jacobi_zeta.cpp
@@ -4,7 +4,10 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+
 #include "test_jacobi_zeta.hpp"
 
 //
diff --git a/test/test_jacobi_zeta.hpp b/test/test_jacobi_zeta.hpp
index 1aa72feb0d..a39d3ba709 100644
--- a/test/test_jacobi_zeta.hpp
+++ b/test/test_jacobi_zeta.hpp
@@ -8,11 +8,17 @@
 // Constants are too big for float case, but this doesn't matter for test.
 #endif
 
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp>
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/math_fwd.hpp>
+#include <boost/math/special_functions/jacobi_zeta.hpp>
 #include <boost/array.hpp>
 //#include
 #include "functor.hpp"
diff --git a/test/test_jacobi_zeta_double.cu b/test/test_jacobi_zeta_double.cu
new file mode 100644
index 0000000000..8594da140b
--- /dev/null
+++ b/test/test_jacobi_zeta_double.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::jacobi_zeta(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::jacobi_zeta(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_jacobi_zeta_float.cu b/test/test_jacobi_zeta_float.cu
new file mode 100644
index 0000000000..7b473455ad
--- /dev/null
+++ b/test/test_jacobi_zeta_float.cu
@@ -0,0 +1,120 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::jacobi_zeta(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::jacobi_zeta(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    int fail_counter = 0;
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
+            {
+                std::cerr << "Result verification failed at element " << i << "!\n"
+                          << "Device: " << output_vector[i] << '\n'
+                          << "  Host: " << results[i] << '\n'
+                          << "   Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                fail_counter++;
+                if (fail_counter > 100)
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (fail_counter > 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
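One easy-to-miss detail in the NVRTC tests on either side of this point: cuLaunchKernel receives the kernel arguments as an array of addresses of the argument values, which is why the tests take &d_in1 and so on even though d_in1 is already a pointer. A minimal sketch of the convention, with hypothetical names; the launch geometry uses the same ceiling division as the tests:

    #include <cuda.h>

    // Launch a kernel with signature (const float* in, float* out, int n).
    void launch(CUfunction kernel, CUdeviceptr in, CUdeviceptr out, int n)
    {
        // Each slot points at a local variable holding one argument; the
        // driver copies the appropriate number of bytes per parameter.
        void* args[] = { &in, &out, &n };

        int blockSize = 256;
        int numBlocks = (n + blockSize - 1) / blockSize;  // ceil(n / blockSize)
        cuLaunchKernel(kernel,
                       numBlocks, 1, 1,   // grid dimensions
                       blockSize, 1, 1,   // block dimensions
                       0, nullptr,        // shared memory bytes, stream
                       args, nullptr);    // kernel parameters, extras
    }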
diff --git a/test/test_jacobi_zeta_nvrtc_double.cpp b/test/test_jacobi_zeta_nvrtc_double.cpp
new file mode 100644
index 0000000000..ded2e66571
--- /dev/null
+++ b/test/test_jacobi_zeta_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+extern "C" __global__
+void test_jacobi_zeta_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::jacobi_zeta(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_jacobi_zeta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_jacobi_zeta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_jacobi_zeta_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::jacobi_zeta(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_jacobi_zeta_nvrtc_float.cpp b/test/test_jacobi_zeta_nvrtc_float.cpp
new file mode 100644
index 0000000000..de52da118d
--- /dev/null
+++ b/test/test_jacobi_zeta_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/jacobi_zeta.hpp>
+extern "C" __global__
+void test_jacobi_zeta_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::jacobi_zeta(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_jacobi_zeta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_jacobi_zeta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_jacobi_zeta_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::jacobi_zeta(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
From f985ba209eedcb29893305f1c82ea4bbb23ecdfa Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 16 Sep 2024 15:12:40 -0400
Subject: [PATCH 2/3] Fix Windows cases of overflow error

---
 test/test_ellint_1.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_ellint_1.cpp b/test/test_ellint_1.cpp
index db82132a5a..9366e4545a 100644
--- a/test/test_ellint_1.cpp
+++ b/test/test_ellint_1.cpp
@@ -10,6 +10,10 @@
 #include <pch_light.hpp>
 #endif
 
+#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#endif
+
 #include "test_ellint_1.hpp"
 
 //
From dbb547b5ac051b698331af342511a005e9b9505c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 16 Sep 2024 16:33:48 -0400
Subject: [PATCH 3/3] Fix CUDA only macro for NVRTC

---
 include/boost/math/tools/config.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp
index 6d7af4d2be..99bc4ee27e 100644
--- a/include/boost/math/tools/config.hpp
+++ b/include/boost/math/tools/config.hpp
@@ -789,6 +789,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return
 
 #define BOOST_MATH_HAS_GPU_SUPPORT
 #define BOOST_MATH_GPU_ENABLED __host__ __device__
+#define BOOST_MATH_CUDA_ENABLED __host__ __device__
 #define BOOST_MATH_STATIC static
 #define BOOST_MATH_STATIC_LOCAL_VARIABLE
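With this definition in place, NVRTC builds see the same BOOST_MATH_CUDA_ENABLED marker that NVCC builds already had, so the functions the docs flag as CUDA-only (supported under NVCC/NVRTC but not SYCL) can be decorated uniformly. Schematically, and with made-up function names rather than anything from the library:

    // Functions supported on every GPU target carry the general marker;
    // under CUDA it expands to __host__ __device__.
    template <typename T>
    BOOST_MATH_GPU_ENABLED T portable_function(T x);

    // Functions implemented for CUDA (NVCC and, after this fix, NVRTC)
    // but not for SYCL carry the CUDA-specific marker instead, so the
    // library can tell the two groups apart without per-call #ifdefs.
    template <typename T>
    BOOST_MATH_CUDA_ENABLED T cuda_only_function(T x);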