From 9ffad9bb2099692aaefdabc5240044cfa6058bc6 Mon Sep 17 00:00:00 2001
From: smokestacklightnin
 <125844868+smokestacklightnin@users.noreply.github.com>
Date: Fri, 18 Aug 2023 12:36:57 -0700
Subject: [PATCH] Add CUDA JIT to `calc_n_effective()` (#119)

* Add basic test for `calc_doppler_width()`

* Refactor `calc_doppler_width()` and add test for vectorized implementation

* Typecast to float

* Add unwrapped cuda implementation of doppler_width

Also typecast all global constants to float

* Add wrapped cuda implementation of calc_doppler_width

* Return cupy array by default

* Add test for `calc_n_effective()`

* Typecast inputs and refactor

* Add cuda implementations and tests for `calc_n_effective()`

* Return cupy array by default
---
 stardis/opacities/broadening.py            | 140 ++++++++++--
 stardis/opacities/tests/test_broadening.py | 238 +++++++++++++++++++++
 2 files changed, 366 insertions(+), 12 deletions(-)
 create mode 100644 stardis/opacities/tests/test_broadening.py

diff --git a/stardis/opacities/broadening.py b/stardis/opacities/broadening.py
index 71abda1b..0fd17a0a 100644
--- a/stardis/opacities/broadening.py
+++ b/stardis/opacities/broadening.py
@@ -1,19 +1,26 @@
 import numpy as np
 from astropy import constants as const
+import math
 import numba
+from numba import cuda
 
+GPUs_available = cuda.is_available()
 
-SPEED_OF_LIGHT = const.c.cgs.value
-BOLTZMANN_CONSTANT = const.k_B.cgs.value
-PLANCK_CONSTANT = const.h.cgs.value
-RYDBERG_ENERGY = (const.h.cgs * const.c.cgs * const.Ryd.cgs).value
-ELEMENTARY_CHARGE = const.e.esu.value
-BOHR_RADIUS = const.a0.cgs.value
-VACUUM_ELECTRIC_PERMITTIVITY = 1 / (4 * np.pi)
+if GPUs_available:
+    import cupy as cp
+
+PI = float(np.pi)
+SPEED_OF_LIGHT = float(const.c.cgs.value)
+BOLTZMANN_CONSTANT = float(const.k_B.cgs.value)
+PLANCK_CONSTANT = float(const.h.cgs.value)
+RYDBERG_ENERGY = float((const.h.cgs * const.c.cgs * const.Ryd.cgs).value)
+ELEMENTARY_CHARGE = float(const.e.esu.value)
+BOHR_RADIUS = float(const.a0.cgs.value)
+VACUUM_ELECTRIC_PERMITTIVITY = 1.0 / (4.0 * PI)
 
 
 @numba.njit
-def calc_doppler_width(nu_line, temperature, atomic_mass):
+def _calc_doppler_width(nu_line, temperature, atomic_mass):
     """
     Calculates doppler width.
     https://ui.adsabs.harvard.edu/abs/2003rtsa.book.....R/
@@ -31,15 +38,66 @@ def calc_doppler_width(nu_line, temperature, atomic_mass):
     -------
     float
     """
+    nu_line, temperature, atomic_mass = (
+        float(nu_line),
+        float(temperature),
+        float(atomic_mass),
+    )
+
     return (
         nu_line
         / SPEED_OF_LIGHT
-        * np.sqrt(2 * BOLTZMANN_CONSTANT * temperature / atomic_mass)
+        * math.sqrt(2.0 * BOLTZMANN_CONSTANT * temperature / atomic_mass)
     )
 
 
+@numba.vectorize(nopython=True)
+def calc_doppler_width(nu_line, temperature, atomic_mass):
+    return _calc_doppler_width(nu_line, temperature, atomic_mass)
+
+
+@cuda.jit
+def _calc_doppler_width_cuda(res, nu_line, temperature, atomic_mass):
+    tid = cuda.grid(1)
+    size = len(res)
+
+    if tid < size:
+        res[tid] = _calc_doppler_width(nu_line[tid], temperature[tid], atomic_mass[tid])
+
+
+def calc_doppler_width_cuda(
+    nu_line,
+    temperature,
+    atomic_mass,
+    nthreads=256,
+    ret_np_ndarray=False,
+    dtype=float,
+):
+    arg_list = (
+        nu_line,
+        temperature,
+        atomic_mass,
+    )
+
+    shortest_arg_idx = np.argmin(map(len, arg_list))
+    size = len(arg_list[shortest_arg_idx])
+
+    nblocks = 1 + (size // nthreads)
+
+    arg_list = tuple(map(lambda v: cp.array(v, dtype=dtype), arg_list))
+
+    res = cp.empty_like(arg_list[shortest_arg_idx], dtype=dtype)
+
+    _calc_doppler_width_cuda[nblocks, nthreads](
+        res,
+        *arg_list,
+    )
+
+    return cp.asnumpy(res) if ret_np_ndarray else res
+
+
 @numba.njit
-def calc_n_effective(ion_number, ionization_energy, level_energy):
+def _calc_n_effective(ion_number, ionization_energy, level_energy):
     """
     Calculates the effective principal quantum number of an energy level.
 
@@ -56,7 +114,65 @@ def calc_n_effective(ion_number, ionization_energy, level_energy):
     -------
     float
     """
-    return np.sqrt(RYDBERG_ENERGY / (ionization_energy - level_energy)) * ion_number
+    ion_number, ionization_energy, level_energy = (
+        int(ion_number),
+        float(ionization_energy),
+        float(level_energy),
+    )
+    return math.sqrt(RYDBERG_ENERGY / (ionization_energy - level_energy)) * ion_number
+
+
+@numba.vectorize(nopython=True)
+def calc_n_effective(ion_number, ionization_energy, level_energy):
+    return _calc_n_effective(
+        ion_number,
+        ionization_energy,
+        level_energy,
+    )
+
+
+@cuda.jit
+def _calc_n_effective_cuda(res, ion_number, ionization_energy, level_energy):
+    tid = cuda.grid(1)
+    size = len(res)
+
+    if tid < size:
+        res[tid] = _calc_n_effective(
+            ion_number[tid],
+            ionization_energy[tid],
+            level_energy[tid],
+        )
+
+
+def calc_n_effective_cuda(
+    ion_number,
+    ionization_energy,
+    level_energy,
+    nthreads=256,
+    ret_np_ndarray=False,
+    dtype=float,
+):
+    arg_list = (
+        ion_number,
+        ionization_energy,
+        level_energy,
+    )
+
+    shortest_arg_idx = np.argmin(map(len, arg_list))
+    size = len(arg_list[shortest_arg_idx])
+
+    nblocks = 1 + (size // nthreads)
+
+    arg_list = tuple(map(lambda v: cp.array(v, dtype=dtype), arg_list))
+
+    res = cp.empty_like(arg_list[shortest_arg_idx], dtype=dtype)
+
+    _calc_n_effective_cuda[nblocks, nthreads](
+        res,
+        *arg_list,
+    )
+
+    return cp.asnumpy(res) if ret_np_ndarray else res
 
 
 @numba.njit
@@ -186,7 +302,7 @@ def calc_gamma_van_der_waals(
 
     gamma_van_der_waals = (
         17
-        * (8 * BOLTZMANN_CONSTANT * temperature / (np.pi * h_mass)) ** 0.3
+        * (8 * BOLTZMANN_CONSTANT * temperature / (PI * h_mass)) ** 0.3
         * c6**0.4
         * h_density
     )
diff --git a/stardis/opacities/tests/test_broadening.py b/stardis/opacities/tests/test_broadening.py
new file mode 100644
index 00000000..be63407c
--- /dev/null
+++ b/stardis/opacities/tests/test_broadening.py
@@ -0,0 +1,238 @@
+import pytest
+import numpy as np
+from astropy import constants as const
+from numba import cuda
+
+from stardis.opacities.broadening import (
+    calc_doppler_width,
+    _calc_doppler_width_cuda,
+    calc_doppler_width_cuda,
+    calc_n_effective,
+    _calc_n_effective_cuda,
+    calc_n_effective_cuda,
+)
+
+GPUs_available = cuda.is_available()
+
+if GPUs_available:
+    import cupy as cp
+
+
+PI = np.pi
+SPEED_OF_LIGHT = const.c.cgs.value
+BOLTZMANN_CONSTANT = const.k_B.cgs.value
+PLANCK_CONSTANT = const.h.cgs.value
+RYDBERG_ENERGY = (const.h.cgs * const.c.cgs * const.Ryd.cgs).value
+ELEMENTARY_CHARGE = const.e.esu.value
+BOHR_RADIUS = const.a0.cgs.value
+VACUUM_ELECTRIC_PERMITTIVITY = 1 / (4 * PI)
+
+
+@pytest.mark.parametrize(
+    "calc_doppler_width_sample_values_input_nu_line,calc_doppler_width_sample_values_input_temperature,calc_doppler_width_sample_values_input_atomic_mass, calc_doppler_width_sample_values_expected_result",
+    [
+        (
+            SPEED_OF_LIGHT,
+            0.5,
+            BOLTZMANN_CONSTANT,
+            1.0,
+        ),
+        (
+            np.array(2 * [SPEED_OF_LIGHT]),
+            np.array(2 * [0.5]),
+            np.array(2 * [BOLTZMANN_CONSTANT]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_doppler_width_sample_values(
+    calc_doppler_width_sample_values_input_nu_line,
+    calc_doppler_width_sample_values_input_temperature,
+    calc_doppler_width_sample_values_input_atomic_mass,
+    calc_doppler_width_sample_values_expected_result,
+):
+    assert np.allclose(
+        calc_doppler_width(
+            calc_doppler_width_sample_values_input_nu_line,
+            calc_doppler_width_sample_values_input_temperature,
+            calc_doppler_width_sample_values_input_atomic_mass,
+        ),
+        calc_doppler_width_sample_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "calc_doppler_width_cuda_unwrapped_sample_values_input_nu_line,calc_doppler_width_cuda_unwrapped_sample_values_input_temperature,calc_doppler_width_cuda_unwrapped_sample_values_input_atomic_mass,calc_doppler_width_cuda_unwrapped_sample_values_expected_result",
+    [
+        (
+            np.array(2 * [SPEED_OF_LIGHT]),
+            np.array(2 * [0.5]),
+            np.array(2 * [BOLTZMANN_CONSTANT]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_doppler_width_cuda_unwrapped_sample_values(
+    calc_doppler_width_cuda_unwrapped_sample_values_input_nu_line,
+    calc_doppler_width_cuda_unwrapped_sample_values_input_temperature,
+    calc_doppler_width_cuda_unwrapped_sample_values_input_atomic_mass,
+    calc_doppler_width_cuda_unwrapped_sample_values_expected_result,
+):
+    arg_list = (
+        calc_doppler_width_cuda_unwrapped_sample_values_input_nu_line,
+        calc_doppler_width_cuda_unwrapped_sample_values_input_temperature,
+        calc_doppler_width_cuda_unwrapped_sample_values_input_atomic_mass,
+    )
+
+    arg_list = tuple(map(cp.array, arg_list))
+    result_values = cp.empty_like(arg_list[0])
+
+    nthreads = 256
+    length = len(calc_doppler_width_cuda_unwrapped_sample_values_expected_result)
+    nblocks = 1 + (length // nthreads)
+
+    _calc_doppler_width_cuda[nblocks, nthreads](result_values, *arg_list)
+
+    assert np.allclose(
+        cp.asnumpy(result_values),
+        calc_doppler_width_cuda_unwrapped_sample_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "calc_doppler_width_cuda_sample_values_input_nu_line, calc_doppler_width_cuda_sample_values_input_temperature, calc_doppler_width_cuda_sample_values_input_atomic_mass, calc_doppler_width_cuda_wrapped_sample_cuda_values_expected_result",
+    [
+        (
+            np.array(2 * [SPEED_OF_LIGHT]),
+            np.array(2 * [0.5]),
+            np.array(2 * [BOLTZMANN_CONSTANT]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_doppler_width_cuda_wrapped_sample_cuda_values(
+    calc_doppler_width_cuda_sample_values_input_nu_line,
+    calc_doppler_width_cuda_sample_values_input_temperature,
+    calc_doppler_width_cuda_sample_values_input_atomic_mass,
+    calc_doppler_width_cuda_wrapped_sample_cuda_values_expected_result,
+):
+    arg_list = (
+        calc_doppler_width_cuda_sample_values_input_nu_line,
+        calc_doppler_width_cuda_sample_values_input_temperature,
+        calc_doppler_width_cuda_sample_values_input_atomic_mass,
+    )
+    assert np.allclose(
+        calc_doppler_width_cuda(*map(cp.asarray, arg_list)),
+        calc_doppler_width_cuda_wrapped_sample_cuda_values_expected_result,
+    )
+
+
+@pytest.mark.parametrize(
+    "calc_n_effective_sample_values_input_ion_number,calc_n_effective_sample_values_input_ionization_energy,calc_n_effective_sample_values_input_level_energy, calc_n_effective_sample_values_expected_result",
+    [
+        (
+            1.0,
+            RYDBERG_ENERGY,
+            0,
+            1.0,
+        ),
+        (
+            np.array(2 * [1]),
+            np.array(2 * [RYDBERG_ENERGY]),
+            np.array(2 * [0]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_n_effective_sample_values(
+    calc_n_effective_sample_values_input_ion_number,
+    calc_n_effective_sample_values_input_ionization_energy,
+    calc_n_effective_sample_values_input_level_energy,
+    calc_n_effective_sample_values_expected_result,
+):
+    assert np.allclose(
+        calc_n_effective(
+            calc_n_effective_sample_values_input_ion_number,
+            calc_n_effective_sample_values_input_ionization_energy,
+            calc_n_effective_sample_values_input_level_energy,
+        ),
+        calc_n_effective_sample_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "calc_n_effective_cuda_unwrapped_sample_values_input_ion_number,calc_n_effective_cuda_unwrapped_sample_values_input_ionization_energy,calc_n_effective_cuda_unwrapped_sample_values_input_level_energy,calc_n_effective_cuda_unwrapped_sample_values_expected_result",
+    [
+        (
+            np.array(2 * [1]),
+            np.array(2 * [RYDBERG_ENERGY]),
+            np.array(2 * [0]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_n_effective_cuda_unwrapped_sample_values(
+    calc_n_effective_cuda_unwrapped_sample_values_input_ion_number,
+    calc_n_effective_cuda_unwrapped_sample_values_input_ionization_energy,
+    calc_n_effective_cuda_unwrapped_sample_values_input_level_energy,
+    calc_n_effective_cuda_unwrapped_sample_values_expected_result,
+):
+    arg_list = (
+        calc_n_effective_cuda_unwrapped_sample_values_input_ion_number,
+        calc_n_effective_cuda_unwrapped_sample_values_input_ionization_energy,
+        calc_n_effective_cuda_unwrapped_sample_values_input_level_energy,
+    )
+
+    arg_list = tuple(map(cp.array, arg_list))
+    result_values = cp.empty_like(arg_list[0])
+
+    nthreads = 256
+    length = len(calc_n_effective_cuda_unwrapped_sample_values_expected_result)
+    nblocks = 1 + (length // nthreads)
+
+    _calc_n_effective_cuda[nblocks, nthreads](result_values, *arg_list)
+
+    assert np.allclose(
+        cp.asnumpy(result_values),
+        calc_n_effective_cuda_unwrapped_sample_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "calc_n_effective_cuda_sample_values_input_ion_number, calc_n_effective_cuda_sample_values_input_ionization_energy, calc_n_effective_cuda_sample_values_input_level_energy, calc_n_effective_cuda_wrapped_sample_cuda_values_expected_result",
+    [
+        (
+            np.array(2 * [1]),
+            np.array(2 * [RYDBERG_ENERGY]),
+            np.array(2 * [0]),
+            np.array(2 * [1.0]),
+        ),
+    ],
+)
+def test_calc_n_effective_cuda_wrapped_sample_cuda_values(
+    calc_n_effective_cuda_sample_values_input_ion_number,
+    calc_n_effective_cuda_sample_values_input_ionization_energy,
+    calc_n_effective_cuda_sample_values_input_level_energy,
+    calc_n_effective_cuda_wrapped_sample_cuda_values_expected_result,
+):
+    arg_list = (
+        calc_n_effective_cuda_sample_values_input_ion_number,
+        calc_n_effective_cuda_sample_values_input_ionization_energy,
+        calc_n_effective_cuda_sample_values_input_level_energy,
+    )
+    assert np.allclose(
+        calc_n_effective_cuda(*map(cp.asarray, arg_list)),
+        calc_n_effective_cuda_wrapped_sample_cuda_values_expected_result,
+    )