diff --git a/stardis/opacities/tests/test_voigt.py b/stardis/opacities/tests/test_voigt.py index 2e2940f6..69ef5465 100644 --- a/stardis/opacities/tests/test_voigt.py +++ b/stardis/opacities/tests/test_voigt.py @@ -1,8 +1,20 @@ import pytest -from numpy import allclose, pi as PI +import numpy as np from math import sqrt +from numba import cuda -from stardis.opacities.voigt import faddeeva, voigt_profile +from stardis.opacities.voigt import ( + faddeeva, + _faddeeva_cuda, + faddeeva_cuda, + voigt_profile, +) + +# Test cases must also take into account use of a GPU to run. If there is no GPU then the test cases will fail. +GPUs_available = cuda.is_available() + +if GPUs_available: + import cupy as cp @pytest.mark.parametrize( @@ -10,17 +22,90 @@ [ (0, 1 + 0j), (0.0, 1.0 + 0.0j), + (np.array([0.0]), np.array([1.0 + 0.0j])), + (np.array([0, 0]), np.array([1 + 0j, 1 + 0j])), ], ) def test_faddeeva_sample_values( faddeeva_sample_values_input, faddeeva_sample_values_expected_result ): - assert allclose( + assert np.allclose( faddeeva(faddeeva_sample_values_input), faddeeva_sample_values_expected_result, ) +@pytest.mark.skipif( + not GPUs_available, reason="No GPU is available to test CUDA function" +) +@pytest.mark.parametrize( + "faddeeva_cuda_unwrapped_sample_values_input, faddeeva_cuda_unwrapped_sample_values_expected_result", + [ + (np.array([0.0], dtype=complex), np.array([1.0 + 0.0j])), + (np.array([0, 0], dtype=complex), np.array([1 + 0j, 1 + 0j])), + ], +) +def test_faddeeva_cuda_unwrapped_sample_values( + faddeeva_cuda_unwrapped_sample_values_input, + faddeeva_cuda_unwrapped_sample_values_expected_result, +): + test_values = cp.asarray(faddeeva_cuda_unwrapped_sample_values_input) + result_values = cp.empty_like(test_values) + + nthreads = 256 + length = len(faddeeva_cuda_unwrapped_sample_values_input) + nblocks = 1 + (length // nthreads) + + _faddeeva_cuda[nblocks, nthreads](result_values, test_values) + + assert np.allclose( + cp.asnumpy(result_values), + faddeeva_cuda_unwrapped_sample_values_expected_result, + ) + + +@pytest.mark.skipif( + not GPUs_available, reason="No GPU is available to test CUDA function" +) +@pytest.mark.parametrize( + "faddeeva_cuda_wrapped_sample_numpy_values_input, faddeeva_cuda_wrapped_sample_numpy_values_expected_result", + [ + (np.array([0.0]), np.array([1.0 + 0.0j])), + (np.array([0, 0]), np.array([1 + 0j, 1 + 0j])), + ], +) +def test_faddeeva_cuda_wrapped_sample_numpy_values( + faddeeva_cuda_wrapped_sample_numpy_values_input, + faddeeva_cuda_wrapped_sample_numpy_values_expected_result, +): + assert np.allclose( + faddeeva_cuda(faddeeva_cuda_wrapped_sample_numpy_values_input), + faddeeva_cuda_wrapped_sample_numpy_values_expected_result, + ) + + +@pytest.mark.skipif( + not GPUs_available, reason="No GPU is available to test CUDA function" +) +@pytest.mark.parametrize( + "faddeeva_cuda_wrapped_sample_cuda_values_input, faddeeva_cuda_wrapped_sample_cuda_values_expected_result", + [ + ( + np.array([0, 0], dtype=complex), + np.array([1 + 0j, 1 + 0j]), + ), + ], +) +def test_faddeeva_cuda_wrapped_sample_cuda_values( + faddeeva_cuda_wrapped_sample_cuda_values_input, + faddeeva_cuda_wrapped_sample_cuda_values_expected_result, +): + assert np.allclose( + faddeeva_cuda(cp.asarray(faddeeva_cuda_wrapped_sample_cuda_values_input)), + faddeeva_cuda_wrapped_sample_cuda_values_expected_result, + ) + + test_voigt_profile_division_by_zero_test_values = [ -100, -5, @@ -57,8 +142,8 @@ def test_voigt_profile_division_by_zero( @pytest.mark.parametrize( "voigt_profile_sample_values_input_delta_nu, voigt_profile_sample_values_input_doppler_width, voigt_profile_sample_values_input_gamma, voigt_profile_sample_values_expected_result", [ - (0, 1, 0, 1 / sqrt(PI)), - (0, 2, 0, 1 / (sqrt(PI) * 2)), + (0, 1, 0, 1 / sqrt(np.pi)), + (0, 2, 0, 1 / (sqrt(np.pi) * 2)), ], ) def test_voigt_profile_sample_values_sample_values( @@ -67,7 +152,7 @@ def test_voigt_profile_sample_values_sample_values( voigt_profile_sample_values_input_gamma, voigt_profile_sample_values_expected_result, ): - assert allclose( + assert np.allclose( voigt_profile( voigt_profile_sample_values_input_delta_nu, voigt_profile_sample_values_input_doppler_width, diff --git a/stardis/opacities/voigt.py b/stardis/opacities/voigt.py index ba15d356..d7bd77ae 100644 --- a/stardis/opacities/voigt.py +++ b/stardis/opacities/voigt.py @@ -1,9 +1,18 @@ import numpy as np import numba +from numba import cuda +import cmath +GPUs_available = cuda.is_available() -@numba.njit -def faddeeva(z): +if GPUs_available: + import cupy as cp + +SQRT_PI = np.sqrt(np.pi, dtype=float) + + +@numba.njit() +def _faddeeva(z): """ The Faddeeva function. Code adapted from https://github.com/tiagopereira/Transparency.jl/blob/966fb46c21/src/voigt.jl#L13. @@ -16,63 +25,88 @@ def faddeeva(z): ------- w : complex """ - s = abs(np.real(z)) + np.imag(z) - - if s > 15.0: - # region I - w = 1j * 1 / np.sqrt(np.pi) * z / (z**2 - 0.5) - - elif s > 5.5: - # region II - w = ( - 1j - * (z * (z**2 * 1 / np.sqrt(np.pi) - 1.4104739589)) - / (0.75 + z**2 * (z**2 - 3.0)) + z = complex(z) + x = float(z.real) + y = float(z.imag) + t = y - 1j * x + s = abs(x) + y + w = complex(0.0) + u = t * t + + IN_REGION_I = s > 15.0 + IN_REGION_II = (not IN_REGION_I) and (s > 5.5) + IN_REGION_III = ( + (not IN_REGION_I) and (not IN_REGION_II) and (y >= 0.195 * abs(x) - 0.176) + ) + IN_REGION_IV = (not IN_REGION_I) and (not IN_REGION_II) and (not IN_REGION_III) + + # If in Region I + w = 1j * 1 / SQRT_PI * z / (z**2 - 0.5) if IN_REGION_I else w + + # If in Region II + w = ( + 1j + * (z * (z**2 * 1 / SQRT_PI - 1.4104739589)) + / (0.75 + z**2 * (z**2 - 3.0)) + if IN_REGION_II + else w + ) + + # If in Region III + w = ( + (16.4955 + t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t)))) + / ( + 16.4955 + + t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t)))) ) + if IN_REGION_III + else w + ) - else: - x = np.real(z) - y = np.imag(z) - t = y - 1j * x - - if y >= 0.195 * abs(x) - 0.176: - # region III - w = ( - 16.4955 - + t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t))) - ) / ( - 16.4955 - + t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t)))) - ) - - else: - # region IV - u = t * t - numerator = t * ( - 36183.31 - - u - * ( - 3321.99 - - u - * ( - 1540.787 - - u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419))) - ) - ) - ) - denominantor = 32066.6 - u * ( - 24322.8 - - u - * ( - 9022.23 - - u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u)))) - ) - ) - w = np.exp(u) - numerator / denominantor + # If in Region IV + numerator = t * ( + 36183.31 + - u + * ( + 3321.99 + - u + * (1540.787 - u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419)))) + ) + ) + denominator = 32066.6 - u * ( + 24322.8 + - u + * (9022.23 - u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u))))) + ) + w = (cmath.exp(u) - numerator / denominator) if IN_REGION_IV else w return w +@numba.vectorize(nopython=True) +def faddeeva(z): + return _faddeeva(z) + + +@cuda.jit +def _faddeeva_cuda(res, z): + tid = cuda.grid(1) + size = len(res) + + if tid < size: + res[tid] = _faddeeva(z[tid]) + + +def faddeeva_cuda(z, nthreads=256, ret_np_ndarray=False): + size = len(z) + nblocks = 1 + (size // nthreads) + z = cp.asarray(z, dtype=complex) + res = cp.empty_like(z) + + _faddeeva_cuda[nblocks, nthreads](res, z) + return cp.asnumpy(res) if ret_np_ndarray else res + + @numba.njit def voigt_profile(delta_nu, doppler_width, gamma): """ @@ -95,5 +129,5 @@ def voigt_profile(delta_nu, doppler_width, gamma): Value of Voigt profile. """ z = (delta_nu + (gamma / (4 * np.pi)) * 1j) / doppler_width - phi = np.real(faddeeva(z)) / (np.sqrt(np.pi) * doppler_width) + phi = np.real(faddeeva(z)) / (SQRT_PI * doppler_width) return phi