From 423ae8df633181e1bd36ed79958228474357ae85 Mon Sep 17 00:00:00 2001
From: Marshall Perrin <mperrin@stsci.edu>
Date: Fri, 24 Mar 2023 16:34:53 -0400
Subject: [PATCH 1/3] remove older CUDA implementation, now replaced by the
 CuPy implementation

---
 poppy/accel_math.py     | 135 +++++-----------------------------------
 poppy/tests/test_fft.py |  41 ------------
 2 files changed, 17 insertions(+), 159 deletions(-)

diff --git a/poppy/accel_math.py b/poppy/accel_math.py
index c91d224f..5e9fadee 100644
--- a/poppy/accel_math.py
+++ b/poppy/accel_math.py
@@ -1,6 +1,6 @@
 # accel_math.py
 #
-# Various functions related to accelerated computations using FFTW, CUDA, numexpr, and related.
+# Various functions related to accelerated computations using FFTW, CUPY, numexpr, and related.
 #
 import numpy as np
 import scipy
@@ -40,16 +40,6 @@
     _MKLFFT_AVAILABLE = False
 
 
-try:
-    # try to import CUDA packages to see if they are available
-    import pyculib
-    from numba import cuda
-    _CUDA_PLANS = {} # plans for various array sizes already prepared
-    _CUDA_AVAILABLE = True
-except ImportError:
-    pyculib = None
-    _CUDA_AVAILABLE = False
-
 try:
     # try to import OpenCL packages to see if they are is available
     import pyopencl
@@ -75,7 +65,6 @@
     _CUPY_AVAILABLE = False    
 
 _USE_CUPY = (conf.use_cupy and _CUPY_AVAILABLE)
-_USE_CUDA = (conf.use_cuda and _CUDA_AVAILABLE)
 _USE_OPENCL = (conf.use_opencl and _OPENCL_AVAILABLE)
 _USE_NUMEXPR = (conf.use_numexpr and _NUMEXPR_AVAILABLE and not _USE_CUPY)
 _USE_FFTW = (conf.use_fftw and _FFTW_AVAILABLE)
@@ -87,11 +76,10 @@
 def update_math_settings():
     """ Update the module-level math flags, based on user settings
     """
-    global _USE_CUPY, _USE_CUDA, _USE_OPENCL, _USE_NUMEXPR, _USE_FFTW, _USE_MKL
+    global _USE_CUPY, _USE_OPENCL, _USE_NUMEXPR, _USE_FFTW, _USE_MKL
     global xp, _scipy
 
     _USE_CUPY = (conf.use_cupy and _CUPY_AVAILABLE)
-    _USE_CUDA = (conf.use_cuda and _CUDA_AVAILABLE)
     _USE_OPENCL = (conf.use_opencl and _OPENCL_AVAILABLE)
     _USE_NUMEXPR = (conf.use_numexpr and _NUMEXPR_AVAILABLE and not _USE_CUPY)
     _USE_FFTW = (conf.use_fftw and _FFTW_AVAILABLE)
@@ -144,27 +132,17 @@ def _exp(x):
         return np.exp(x)
 
 def _fftshift(x):
-    """ FFT shifts of array contents, using CUDA if available.
+    """ FFT shifts of array contents, using CUDA/CuPY if available.
     Otherwise defaults to numpy.
 
     Note - TODO write an OpenCL version
+    Note 2 - This used to be an interesting function with a clever CUDA implementation
+             of fftshift, which has since been obsoleted by cupy's implementation.
+             This wrapper function could be refactored out in the future
 
     See also ifftshift
     """
-
-    N=x.shape[0]
-    # the CUDA fftshift is set up to work on blocks of 32, so
-    # N must be a multiple of 32. We check this rapidly using a bit mask:
-    #    (x & 31)==0  is a ~20x faster equivalent of (np.mod(x,32)==0)
-    if (_USE_CUDA) & (N==x.shape[1]) & ((N & 31)==0):
-        blockdim = (32, 32) # threads per block
-        numBlocks = (int(N/blockdim[0]),int(N/blockdim[1]))
-        cufftShift_2D_kernel[numBlocks, blockdim](x.ravel(),N)
-        return x
-    elif _USE_CUPY:
-        return cp.fft.fftshift(x)
-    else:
-        return np.fft.fftshift(x)
+    return xp.fft.fftshift(x)
 
 def _ifftshift(x):
     """ Inverse FFT shifts of array contents, using CUDA if available.
@@ -175,23 +153,14 @@ def _ifftshift(x):
     so we can use the same algorithm as fftshift.
 
     Note - TODO write an OpenCL version
+    Note 2 - This used to be an interesting function with a clever CUDA implementation
+             of fftshift, which has since been obsoleted by cupy's implementation.
+             This wrapper function could be refactored out in the future
 
     See also fftshift
     """
 
-    N=x.shape[0]
-    # the CUDA fftshift is set up to work on blocks of 32, so
-    # N must be a multiple of 32. We check this rapidly using a bit mask:
-    #   not (x & 31)  is a ~20x faster equivalent of (np.mod(x,32)==0)
-    if (_USE_CUDA) & (N==x.shape[1]) & ((N & 31)==0):
-        blockdim = (32, 32) # threads per block
-        numBlocks = (int(N/blockdim[0]),int(N/blockdim[1]))
-        cufftShift_2D_kernel[numBlocks, blockdim](x.ravel(),N)
-        return x
-    elif _USE_CUPY:
-        return cp.fft.ifftshift(x)
-    else:
-        return np.fft.ifftshift(x)
+    return xp.fft.ifftshift(x)
 
 
 
@@ -199,7 +168,7 @@ def _ifftshift(x):
 def fft_2d(wavefront, forward=True, normalization=None, fftshift=True):
     """ main entry point for FFTs, used in Wavefront._propagate_fft and
     elsewhere. This will invoke one of the following, depending on availability:
-        - CUDA on NVidia GPU
+        - CUDA CuPy on NVidia GPU
         - OpenCL on AMD GPU
         - FFTW on CPU
         - numpy on CPU
@@ -229,7 +198,7 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True):
 
     """
     ## To use a fast FFT, it must both be enabled and the library itself has to be present
-    global _USE_OPENCL, _USE_CUDA, _USE_CUPY # need to declare global in case we need to change it, below
+    global _USE_OPENCL, _USE_CUPY # need to declare global in case we need to change it, below
     t0 = time.time()
 
     # OpenCL cfFFT only can FFT certain array sizes.
@@ -243,8 +212,6 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True):
         method = 'cupy (GPU)'
     elif _USE_OPENCL:
         method = 'pyopencl (OpenCL GPU)'
-    elif _USE_CUDA:
-        method = 'pyculib (CUDA GPU)'
     elif _USE_MKL:
         method = 'mkl_fft'
     elif _USE_FFTW:
@@ -263,27 +230,7 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True):
         wavefront = _ifftshift(wavefront)
 
     t1 = time.time()
-    if _USE_CUDA:
-        if normalization is None:
-            normalization = 1./wavefront.shape[0]  # regardless of direction, for CUDA
-
-        # We need a CUDA FFT plan for each size and shape of FFT.
-        # The plans can be cached for reuse, since they cost some
-        # 10s of milliseconds to create
-        params = (wavefront.shape, wavefront.dtype, wavefront.dtype)
-        try:
-            cufftplan = _CUDA_PLANS[params]
-        except KeyError:
-            cufftplan = pyculib.fft.FFTPlan(*params)
-            _CUDA_PLANS[params] = cufftplan
-
-        # perform FFT on GPU, and return results in place to same array.
-        if forward:
-            cufftplan.forward(wavefront, out=wavefront)
-        else:
-            cufftplan.inverse(wavefront, out=wavefront)
-
-    elif _USE_OPENCL:
+    if _USE_OPENCL:
         if normalization is None:
             normalization = 1./wavefront.shape[0] if forward else wavefront.shape[0]
 
@@ -294,13 +241,13 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True):
         event.wait()
         wavefront[:] = wf_on_gpu.get()
         del wf_on_gpu
-        
-    if _USE_CUPY:
+
+    elif _USE_CUPY:
         do_fft = cp.fft.fft2 if forward else cp.fft.ifft2
         if normalization is None:
             normalization = 1./wavefront.shape[0] if forward else wavefront.shape[0]
         wavefront = do_fft(wavefront)
-    
+
     elif _USE_MKL:
         # Intel MKL is a drop-in replacement for numpy fft but much faster
         do_fft = mkl_fft.fft2 if forward else mkl_fft.ifft2
@@ -398,42 +345,6 @@ def get_opencl_context():
         return (_OPENCL_STATE['context'], _OPENCL_STATE['queue'])
 
 
-
-
-
-if  _CUDA_AVAILABLE:
-    @cuda.jit()
-    def cufftShift_2D_kernel(data, N):
-        """
-        adopted CUDA FFT shift code from:
-        https://github.com/marwan-abdellah/cufftShift
-        (GNU Lesser Public License)
-        """
-
-        # // 2D Slice & 1D Line
-        sLine = N
-        sSlice = N * N
-        # // Transformations Equations
-        sEq1 = int((sSlice + sLine) / 2)
-        sEq2 = int((sSlice - sLine) / 2)
-        x, y = cuda.grid(2)
-        # // Thread Index Converted into 1D Index
-        index = (y * N) + x
-
-        if x < N / 2:
-            if y < N / 2:
-                # // First Quad
-                temp = data[index]
-                data[index] = data[index + sEq1]
-                # // Third Quad
-                data[index + sEq1] = temp
-        else:
-            if y < N / 2:
-                # // Second Quad
-                temp = data[index]
-                data[index] = data[index + sEq2]
-                data[index + sEq2] = temp
-
 # ##################################################################
 #
 #     Performance benchmarks
@@ -498,20 +409,9 @@ def benchmark_fft(npix=2048, iterations=20, double_precision=True):
     else:
         time_mkl = np.NaN
 
-    if poppy.accel_math._CUDA_AVAILABLE:
-        print("Timing performance with CUDA:")
-        poppy.conf.use_cuda = True
-        poppy.conf.use_opencl = False
-        update_math_settings()
-        time_cuda = timer.timeit(number=iterations) / iterations
-        print("  {:.3f} s".format(time_cuda))
-    else:
-        time_cuda = np.NaN
-
     if poppy.accel_math._OPENCL_AVAILABLE:
         print("Timing performance with OpenCL:")
         poppy.conf.use_opencl = True
-        poppy.conf.use_cuda = False
         update_math_settings()
         time_opencl = timer.timeit(number=iterations) / iterations
         print("  {:.3f} s".format(time_opencl))
@@ -526,7 +426,6 @@ def benchmark_fft(npix=2048, iterations=20, double_precision=True):
             'fftw': time_fftw,
             'numexpr': time_numexpr,
             'mkl': time_mkl,
-            'cupy': time_cuda,
             'opencl': time_opencl}
 
 
diff --git a/poppy/tests/test_fft.py b/poppy/tests/test_fft.py
index b8a8f03b..446a6bcc 100644
--- a/poppy/tests/test_fft.py
+++ b/poppy/tests/test_fft.py
@@ -172,7 +172,6 @@ def test_parity_FFT_forward_inverse(display=False):
 # Test different FFT algorithms for consistency:
 #    - numpy
 #    - fftw
-#    - CUDA / pyculib
 #    - OpenCL / clfft / gpyfft
 #
 # The test method is the same for all: calculate PSFs for an
@@ -235,46 +234,6 @@ def test_pyfftw_vs_numpyfft(verbose=False):
     conf.use_fftw, conf.use_cuda, conf.use_opencl, conf.use_cupy = defaults
 
 
-@pytest.mark.skipif(accel_math._CUDA_AVAILABLE is False, reason="CUDA not available")
-def test_cuda_vs_numpyfft(verbose=False):
-    """ Create an optical system with 2 parity test apertures,
-    propagate light through it, and compare that we get the same results from both numpy and CUDA"""
-
-    defaults = conf.use_fftw, conf.use_cuda, conf.use_opencl
-
-    conf.use_fftw = False
-    conf.use_opencl = False
-
-    sys = setup_test_osys()
-
-    conf.use_cuda = False
-    psf_numpy, intermediates_numpy = sys.calc_psf(wavelength=1e-6, return_intermediates=True)
-
-    conf.use_cuda = True
-    psf_cuda, intermediates_cuda = sys.calc_psf(wavelength=1e-6, return_intermediates=True)
-
-    # check the final PSFs are consistent
-    assert np.abs(psf_cuda[0].data-psf_numpy[0].data).max() < 1e-6
-
-    # Check flux conservation for the intermediate arrays behaves properly
-    intermediates = intermediates_cuda
-    epsilon = np.finfo(intermediates[0].wavefront.dtype).eps
-    total_int_input = intermediates_numpy[0].total_intensity
-    for i in [1,2]:
-        assert np.abs(intermediates[i].total_intensity - total_int_input) < 5*epsilon
-
-    # Check flux in output array is about 0.5% less than input array (due to finite FOV)
-    expected = 0.004949550538272617927759
-    assert np.abs(intermediates[3].total_intensity - total_int_input) - expected < 5*epsilon
-
-    if verbose:
-        print ("PSF difference: ", np.abs(psf_cuda[0].data-psf_numpy[0].data).max())
-        for i in [1,2]:
-            print(" Int. WF {} intensity diff: {}".format(i, np.abs(intermediates[i].total_intensity-total_int_input)) )
-        print(" Final PSF intensity diff:", np.abs(intermediates[3].total_intensity-total_int_input) - expected)
-
-    conf.use_fftw, conf.use_cuda, conf.use_opencl = defaults
-
 @pytest.mark.skipif(accel_math._OPENCL_AVAILABLE is False, reason="OPENCL not available")
 def test_opencl_vs_numpyfft(verbose=False):
     """ Create an optical system with 2 parity test apertures,

From aac38985778d5b2c9690f82ae915de01695f98e6 Mon Sep 17 00:00:00 2001
From: Marshall Perrin <mperrin@stsci.edu>
Date: Fri, 17 Feb 2023 10:18:43 -0700
Subject: [PATCH 2/3] Add initial GPU docs, and start updating release notes
 including for GPU code

---
 docs/gpu_acceleration.rst | 78 +++++++++++++++++++++++++++++++++++++++
 docs/index.rst            |  1 +
 docs/relnotes.rst         | 18 ++++++---
 3 files changed, 92 insertions(+), 5 deletions(-)
 create mode 100644 docs/gpu_acceleration.rst

diff --git a/docs/gpu_acceleration.rst b/docs/gpu_acceleration.rst
new file mode 100644
index 00000000..20440e45
--- /dev/null
+++ b/docs/gpu_acceleration.rst
@@ -0,0 +1,78 @@
+GPU Accelerated Optical Calculations
+====================================
+
+
+
+.. admonition:: Placeholder docs
+   This page is a placeholder for more complete documentation to be added later about usage of GPUs for fast optical calculations.
+
+
+
+Thanks to team members Kian Milani and Ewan Douglas (University of Arizona), poppy now includes a
+high performance option using NVidia GPUs to significantly accelerate optical calculations, in some
+cases by ~20x to 80x. 
+
+This implementation seeks to perform all calculations on the GPU until the end of propagation. This
+reduces time for calculations as arrays no longer need to be transferred between GPU memory and
+standard memory when performing different calculations. It also allows GPU acceleration of the
+majority of calculations performed during an optical propagation (i.e. creating models of optical
+elements and applying them to wavefronts happens on the GPU, as well as the propagation calculations
+from one plane to another.)
+
+An updated implementation using `CuPy <https://docs.cupy.dev/en/stable/overview.html>` replaces
+initial earlier support for CUDA using pyculib and numba.cuda. (That initial implementation has been
+removed since the CuPy implementation is much better performing.)
+
+Note, because cupy is used as a replacement for numpy at import time, it is a bit tricky to toggle
+between GPU and CPU calculations during the same python session. Doing so is advanced usage, and
+while it can be useful in some cases for debugging or benchmarking, it's not fully supported or
+recommended to try to switch between calculation backends within the same session. 
+
+
+**What about AMD GPUs?**
+
+There also exists partial/earlier support for OpenCL for AMD GPUs, using the `pyopencl` and `gpyfft`
+packages. This provides much less performance gains than the CuPy version, however, since only 
+FFTs are performed on-GPU, not other parts of the optical propagation calculations.
+
+**What about Apple Silicon GPUs?**
+
+Poppy does not yet have support for the specialized GPU hardware in Apple Silicon M1/M2 and similar.
+For these machines, plain numpy is the best option.
+
+Requirements and Setup
+----------------------
+
+
+Requires NVidia GPU hardware
+
+Requires CuPy > 10.0. Install from https://cupy.dev following the `CuPy installation docs <https://docs.cupy.dev/en/stable/install.html#>`_
+
+Also requires the cupyx GPU-accelerated version of scipy.
+
+
+Performance Comparisons
+-----------------------
+
+
+
+Computation comparisons have been performed to illustrate the benefit of this accelerated computing
+feature. Below are comparisons of the times required for a PSF to be calculated for varying array
+sizes using the MKL FFT option versus the CuPy calculations. The optical systems tested had 5
+different surfaces/optics. 
+
+Performances will naturally vary depending on the compute hardware used. The system used for these
+comparisons was the University of Arizona’s HPC Puma nodes. The node utilized 32 AMD EPYC 7642 CPUs
+and the NVIDIA Tesla V100S GPU.
+
++-------------------+--------------+------------------------+-------------------------+----------------------+
+|  Propagation Type |	Array Size |	MKL Method Times [s] |	CuPy Method Times [s] |	Speed Up Factor      |
++===============+=======+=======+===============+=======+
+| Fraunhofer	| 1024	| 0.218	| 0.0261	| 8.35  |
+| Fraunhofer	| 2048	| 0.755	| 0.0294	| 25.7  |
+| Fraunhofer	| 4096	| 3.36	| 0.0423	| 79.4  |
+| Fresnel	| 1024	| 0.714	| 0.0438	| 16.3  |
+| Fresnel	| 2048	| 4.16	| 0.0845	| 49.2  |
+| Fresnel	| 4096	| 17.5	| 0.225	        | 77.8  |
++---------------+-------+-------+---------------+-------+
+
diff --git a/docs/index.rst b/docs/index.rst
index d1e727e7..18bcaced 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -67,6 +67,7 @@ Contents
   about.rst
   performance.rst
   fft_optimization.rst
+  gpu_acceleration.rst
   dev_notes.rst
 
 
diff --git a/docs/relnotes.rst b/docs/relnotes.rst
index 04598c5d..3880ea63 100644
--- a/docs/relnotes.rst
+++ b/docs/relnotes.rst
@@ -5,15 +5,23 @@ Release Notes
 
 For a list of contributors, see :ref:`about`.
 
- 1.1.0
- -----
+.. _rel1.1.0:
 
- .. _rel1.1.0::
+1.1.0
+-----
+
+This release introduces support for much faster (20-80x) optical calculations using GPU acceleration via the CuPy library for NVidia GPUs. Credit to Kian Milani (:user:`kian1337`) for this significant and complex improvement. 
+
+
+*2023 date TBD*
 
- *2023 date TBD*
+ **Performance Enhancements:**
+  * Implement CuPy GPU acceleration for calculations throughout poppy. (PR :pr:`499` by :user:`kian1377` and :user:`mperrin`)
 
  **Software Infrastructure Updates and Internals:**
-  * Increase all CI test versions by 1, removing Python 3.8 and adding Python 3.11. Update minimum supported versions of astropy and numpy as well. (PR TBD by :user:`mperrin`)
+  * Increase all CI test versions by 1, removing Python 3.8 and adding Python 3.11. Update minimum supported versions of astropy and numpy as well. (PR :pr:`551` by :user:`BradleySappington`)
+  * Updates to dependencies (various PRs)
+  * Bug fix for a recent change in numpy interface (:pr:`545` by :user:`adambolton`)
 
 
 1.0.3

From e4eac9ba1ccec0de55d7aa1c5cc520970725f701 Mon Sep 17 00:00:00 2001
From: Marshall Perrin <mperrin@stsci.edu>
Date: Tue, 9 May 2023 17:28:33 -0400
Subject: [PATCH 3/3] remove unnecessarywq deprecated cuda line from
 __init__.py

---
 poppy/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/poppy/__init__.py b/poppy/__init__.py
index d3857ca6..31349954 100644
--- a/poppy/__init__.py
+++ b/poppy/__init__.py
@@ -62,9 +62,6 @@ class Conf(_config.ConfigNamespace):
                                               '"wisdom" for improved speed?')
     use_mkl = _config.ConfigItem(True, "Use Intel MKL for FFTs (assuming it is available). "
                                        "This has highest priority for CPU-based FFT over other FFT options, if multiple are set True.")
-
-    use_cuda = _config.ConfigItem(True, 'Use cuda for FFTs on GPU (assuming it' +
-            'is available)?')
     use_opencl = _config.ConfigItem(True, 'Use OpenCL for FFTs on GPU (assuming it' +
             'is available)?')
     use_cupy = _config.ConfigItem(True, 'Use CuPy for FFTs on GPU (assuming it' +