From 423ae8df633181e1bd36ed79958228474357ae85 Mon Sep 17 00:00:00 2001 From: Marshall Perrin Date: Fri, 24 Mar 2023 16:34:53 -0400 Subject: [PATCH 1/3] remove older CUDA implementation, now replaced by the CuPy implementation --- poppy/accel_math.py | 135 +++++----------------------------------- poppy/tests/test_fft.py | 41 ------------ 2 files changed, 17 insertions(+), 159 deletions(-) diff --git a/poppy/accel_math.py b/poppy/accel_math.py index c91d224f..5e9fadee 100644 --- a/poppy/accel_math.py +++ b/poppy/accel_math.py @@ -1,6 +1,6 @@ # accel_math.py # -# Various functions related to accelerated computations using FFTW, CUDA, numexpr, and related. +# Various functions related to accelerated computations using FFTW, CUPY, numexpr, and related. # import numpy as np import scipy @@ -40,16 +40,6 @@ _MKLFFT_AVAILABLE = False -try: - # try to import CUDA packages to see if they are available - import pyculib - from numba import cuda - _CUDA_PLANS = {} # plans for various array sizes already prepared - _CUDA_AVAILABLE = True -except ImportError: - pyculib = None - _CUDA_AVAILABLE = False - try: # try to import OpenCL packages to see if they are is available import pyopencl @@ -75,7 +65,6 @@ _CUPY_AVAILABLE = False _USE_CUPY = (conf.use_cupy and _CUPY_AVAILABLE) -_USE_CUDA = (conf.use_cuda and _CUDA_AVAILABLE) _USE_OPENCL = (conf.use_opencl and _OPENCL_AVAILABLE) _USE_NUMEXPR = (conf.use_numexpr and _NUMEXPR_AVAILABLE and not _USE_CUPY) _USE_FFTW = (conf.use_fftw and _FFTW_AVAILABLE) @@ -87,11 +76,10 @@ def update_math_settings(): """ Update the module-level math flags, based on user settings """ - global _USE_CUPY, _USE_CUDA, _USE_OPENCL, _USE_NUMEXPR, _USE_FFTW, _USE_MKL + global _USE_CUPY, _USE_OPENCL, _USE_NUMEXPR, _USE_FFTW, _USE_MKL global xp, _scipy _USE_CUPY = (conf.use_cupy and _CUPY_AVAILABLE) - _USE_CUDA = (conf.use_cuda and _CUDA_AVAILABLE) _USE_OPENCL = (conf.use_opencl and _OPENCL_AVAILABLE) _USE_NUMEXPR = (conf.use_numexpr and _NUMEXPR_AVAILABLE and not _USE_CUPY) _USE_FFTW = (conf.use_fftw and _FFTW_AVAILABLE) @@ -144,27 +132,17 @@ def _exp(x): return np.exp(x) def _fftshift(x): - """ FFT shifts of array contents, using CUDA if available. + """ FFT shifts of array contents, using CUDA/CuPY if available. Otherwise defaults to numpy. Note - TODO write an OpenCL version + Note 2 - This used to be an interesting function with a clever CUDA implementation + of fftshift, which has since been obsoleted by cupy's implementation. + This wrapper function could be refactored out in the future See also ifftshift """ - - N=x.shape[0] - # the CUDA fftshift is set up to work on blocks of 32, so - # N must be a multiple of 32. We check this rapidly using a bit mask: - # (x & 31)==0 is a ~20x faster equivalent of (np.mod(x,32)==0) - if (_USE_CUDA) & (N==x.shape[1]) & ((N & 31)==0): - blockdim = (32, 32) # threads per block - numBlocks = (int(N/blockdim[0]),int(N/blockdim[1])) - cufftShift_2D_kernel[numBlocks, blockdim](x.ravel(),N) - return x - elif _USE_CUPY: - return cp.fft.fftshift(x) - else: - return np.fft.fftshift(x) + return xp.fft.fftshift(x) def _ifftshift(x): """ Inverse FFT shifts of array contents, using CUDA if available. @@ -175,23 +153,14 @@ def _ifftshift(x): so we can use the same algorithm as fftshift. Note - TODO write an OpenCL version + Note 2 - This used to be an interesting function with a clever CUDA implementation + of fftshift, which has since been obsoleted by cupy's implementation. + This wrapper function could be refactored out in the future See also fftshift """ - N=x.shape[0] - # the CUDA fftshift is set up to work on blocks of 32, so - # N must be a multiple of 32. We check this rapidly using a bit mask: - # not (x & 31) is a ~20x faster equivalent of (np.mod(x,32)==0) - if (_USE_CUDA) & (N==x.shape[1]) & ((N & 31)==0): - blockdim = (32, 32) # threads per block - numBlocks = (int(N/blockdim[0]),int(N/blockdim[1])) - cufftShift_2D_kernel[numBlocks, blockdim](x.ravel(),N) - return x - elif _USE_CUPY: - return cp.fft.ifftshift(x) - else: - return np.fft.ifftshift(x) + return xp.fft.ifftshift(x) @@ -199,7 +168,7 @@ def _ifftshift(x): def fft_2d(wavefront, forward=True, normalization=None, fftshift=True): """ main entry point for FFTs, used in Wavefront._propagate_fft and elsewhere. This will invoke one of the following, depending on availability: - - CUDA on NVidia GPU + - CUDA CuPy on NVidia GPU - OpenCL on AMD GPU - FFTW on CPU - numpy on CPU @@ -229,7 +198,7 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True): """ ## To use a fast FFT, it must both be enabled and the library itself has to be present - global _USE_OPENCL, _USE_CUDA, _USE_CUPY # need to declare global in case we need to change it, below + global _USE_OPENCL, _USE_CUPY # need to declare global in case we need to change it, below t0 = time.time() # OpenCL cfFFT only can FFT certain array sizes. @@ -243,8 +212,6 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True): method = 'cupy (GPU)' elif _USE_OPENCL: method = 'pyopencl (OpenCL GPU)' - elif _USE_CUDA: - method = 'pyculib (CUDA GPU)' elif _USE_MKL: method = 'mkl_fft' elif _USE_FFTW: @@ -263,27 +230,7 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True): wavefront = _ifftshift(wavefront) t1 = time.time() - if _USE_CUDA: - if normalization is None: - normalization = 1./wavefront.shape[0] # regardless of direction, for CUDA - - # We need a CUDA FFT plan for each size and shape of FFT. - # The plans can be cached for reuse, since they cost some - # 10s of milliseconds to create - params = (wavefront.shape, wavefront.dtype, wavefront.dtype) - try: - cufftplan = _CUDA_PLANS[params] - except KeyError: - cufftplan = pyculib.fft.FFTPlan(*params) - _CUDA_PLANS[params] = cufftplan - - # perform FFT on GPU, and return results in place to same array. - if forward: - cufftplan.forward(wavefront, out=wavefront) - else: - cufftplan.inverse(wavefront, out=wavefront) - - elif _USE_OPENCL: + if _USE_OPENCL: if normalization is None: normalization = 1./wavefront.shape[0] if forward else wavefront.shape[0] @@ -294,13 +241,13 @@ def fft_2d(wavefront, forward=True, normalization=None, fftshift=True): event.wait() wavefront[:] = wf_on_gpu.get() del wf_on_gpu - - if _USE_CUPY: + + elif _USE_CUPY: do_fft = cp.fft.fft2 if forward else cp.fft.ifft2 if normalization is None: normalization = 1./wavefront.shape[0] if forward else wavefront.shape[0] wavefront = do_fft(wavefront) - + elif _USE_MKL: # Intel MKL is a drop-in replacement for numpy fft but much faster do_fft = mkl_fft.fft2 if forward else mkl_fft.ifft2 @@ -398,42 +345,6 @@ def get_opencl_context(): return (_OPENCL_STATE['context'], _OPENCL_STATE['queue']) - - - -if _CUDA_AVAILABLE: - @cuda.jit() - def cufftShift_2D_kernel(data, N): - """ - adopted CUDA FFT shift code from: - https://github.com/marwan-abdellah/cufftShift - (GNU Lesser Public License) - """ - - # // 2D Slice & 1D Line - sLine = N - sSlice = N * N - # // Transformations Equations - sEq1 = int((sSlice + sLine) / 2) - sEq2 = int((sSlice - sLine) / 2) - x, y = cuda.grid(2) - # // Thread Index Converted into 1D Index - index = (y * N) + x - - if x < N / 2: - if y < N / 2: - # // First Quad - temp = data[index] - data[index] = data[index + sEq1] - # // Third Quad - data[index + sEq1] = temp - else: - if y < N / 2: - # // Second Quad - temp = data[index] - data[index] = data[index + sEq2] - data[index + sEq2] = temp - # ################################################################## # # Performance benchmarks @@ -498,20 +409,9 @@ def benchmark_fft(npix=2048, iterations=20, double_precision=True): else: time_mkl = np.NaN - if poppy.accel_math._CUDA_AVAILABLE: - print("Timing performance with CUDA:") - poppy.conf.use_cuda = True - poppy.conf.use_opencl = False - update_math_settings() - time_cuda = timer.timeit(number=iterations) / iterations - print(" {:.3f} s".format(time_cuda)) - else: - time_cuda = np.NaN - if poppy.accel_math._OPENCL_AVAILABLE: print("Timing performance with OpenCL:") poppy.conf.use_opencl = True - poppy.conf.use_cuda = False update_math_settings() time_opencl = timer.timeit(number=iterations) / iterations print(" {:.3f} s".format(time_opencl)) @@ -526,7 +426,6 @@ def benchmark_fft(npix=2048, iterations=20, double_precision=True): 'fftw': time_fftw, 'numexpr': time_numexpr, 'mkl': time_mkl, - 'cupy': time_cuda, 'opencl': time_opencl} diff --git a/poppy/tests/test_fft.py b/poppy/tests/test_fft.py index b8a8f03b..446a6bcc 100644 --- a/poppy/tests/test_fft.py +++ b/poppy/tests/test_fft.py @@ -172,7 +172,6 @@ def test_parity_FFT_forward_inverse(display=False): # Test different FFT algorithms for consistency: # - numpy # - fftw -# - CUDA / pyculib # - OpenCL / clfft / gpyfft # # The test method is the same for all: calculate PSFs for an @@ -235,46 +234,6 @@ def test_pyfftw_vs_numpyfft(verbose=False): conf.use_fftw, conf.use_cuda, conf.use_opencl, conf.use_cupy = defaults -@pytest.mark.skipif(accel_math._CUDA_AVAILABLE is False, reason="CUDA not available") -def test_cuda_vs_numpyfft(verbose=False): - """ Create an optical system with 2 parity test apertures, - propagate light through it, and compare that we get the same results from both numpy and CUDA""" - - defaults = conf.use_fftw, conf.use_cuda, conf.use_opencl - - conf.use_fftw = False - conf.use_opencl = False - - sys = setup_test_osys() - - conf.use_cuda = False - psf_numpy, intermediates_numpy = sys.calc_psf(wavelength=1e-6, return_intermediates=True) - - conf.use_cuda = True - psf_cuda, intermediates_cuda = sys.calc_psf(wavelength=1e-6, return_intermediates=True) - - # check the final PSFs are consistent - assert np.abs(psf_cuda[0].data-psf_numpy[0].data).max() < 1e-6 - - # Check flux conservation for the intermediate arrays behaves properly - intermediates = intermediates_cuda - epsilon = np.finfo(intermediates[0].wavefront.dtype).eps - total_int_input = intermediates_numpy[0].total_intensity - for i in [1,2]: - assert np.abs(intermediates[i].total_intensity - total_int_input) < 5*epsilon - - # Check flux in output array is about 0.5% less than input array (due to finite FOV) - expected = 0.004949550538272617927759 - assert np.abs(intermediates[3].total_intensity - total_int_input) - expected < 5*epsilon - - if verbose: - print ("PSF difference: ", np.abs(psf_cuda[0].data-psf_numpy[0].data).max()) - for i in [1,2]: - print(" Int. WF {} intensity diff: {}".format(i, np.abs(intermediates[i].total_intensity-total_int_input)) ) - print(" Final PSF intensity diff:", np.abs(intermediates[3].total_intensity-total_int_input) - expected) - - conf.use_fftw, conf.use_cuda, conf.use_opencl = defaults - @pytest.mark.skipif(accel_math._OPENCL_AVAILABLE is False, reason="OPENCL not available") def test_opencl_vs_numpyfft(verbose=False): """ Create an optical system with 2 parity test apertures, From aac38985778d5b2c9690f82ae915de01695f98e6 Mon Sep 17 00:00:00 2001 From: Marshall Perrin Date: Fri, 17 Feb 2023 10:18:43 -0700 Subject: [PATCH 2/3] Add initial GPU docs, and start updating release notes including for GPU code --- docs/gpu_acceleration.rst | 78 +++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + docs/relnotes.rst | 18 ++++++--- 3 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 docs/gpu_acceleration.rst diff --git a/docs/gpu_acceleration.rst b/docs/gpu_acceleration.rst new file mode 100644 index 00000000..20440e45 --- /dev/null +++ b/docs/gpu_acceleration.rst @@ -0,0 +1,78 @@ +GPU Accelerated Optical Calculations +==================================== + + + +.. admonition:: Placeholder docs + This page is a placeholder for more complete documentation to be added later about usage of GPUs for fast optical calculations. + + + +Thanks to team members Kian Milani and Ewan Douglas (University of Arizona), poppy now includes a +high performance option using NVidia GPUs to significantly accelerate optical calculations, in some +cases by ~20x to 80x. + +This implementation seeks to perform all calculations on the GPU until the end of propagation. This +reduces time for calculations as arrays no longer need to be transferred between GPU memory and +standard memory when performing different calculations. It also allows GPU acceleration of the +majority of calculations performed during an optical propagation (i.e. creating models of optical +elements and applying them to wavefronts happens on the GPU, as well as the propagation calculations +from one plane to another.) + +An updated implementation using `CuPy ` replaces +initial earlier support for CUDA using pyculib and numba.cuda. (That initial implementation has been +removed since the CuPy implementation is much better performing.) + +Note, because cupy is used as a replacement for numpy at import time, it is a bit tricky to toggle +between GPU and CPU calculations during the same python session. Doing so is advanced usage, and +while it can be useful in some cases for debugging or benchmarking, it's not fully supported or +recommended to try to switch between calculation backends within the same session. + + +**What about AMD GPUs?** + +There also exists partial/earlier support for OpenCL for AMD GPUs, using the `pyopencl` and `gpyfft` +packages. This provides much less performance gains than the CuPy version, however, since only +FFTs are performed on-GPU, not other parts of the optical propagation calculations. + +**What about Apple Silicon GPUs?** + +Poppy does not yet have support for the specialized GPU hardware in Apple Silicon M1/M2 and similar. +For these machines, plain numpy is the best option. + +Requirements and Setup +---------------------- + + +Requires NVidia GPU hardware + +Requires CuPy > 10.0. Install from https://cupy.dev following the `CuPy installation docs `_ + +Also requires the cupyx GPU-accelerated version of scipy. + + +Performance Comparisons +----------------------- + + + +Computation comparisons have been performed to illustrate the benefit of this accelerated computing +feature. Below are comparisons of the times required for a PSF to be calculated for varying array +sizes using the MKL FFT option versus the CuPy calculations. The optical systems tested had 5 +different surfaces/optics. + +Performances will naturally vary depending on the compute hardware used. The system used for these +comparisons was the University of Arizona’s HPC Puma nodes. The node utilized 32 AMD EPYC 7642 CPUs +and the NVIDIA Tesla V100S GPU. + ++-------------------+--------------+------------------------+-------------------------+----------------------+ +| Propagation Type | Array Size | MKL Method Times [s] | CuPy Method Times [s] | Speed Up Factor | ++===============+=======+=======+===============+=======+ +| Fraunhofer | 1024 | 0.218 | 0.0261 | 8.35 | +| Fraunhofer | 2048 | 0.755 | 0.0294 | 25.7 | +| Fraunhofer | 4096 | 3.36 | 0.0423 | 79.4 | +| Fresnel | 1024 | 0.714 | 0.0438 | 16.3 | +| Fresnel | 2048 | 4.16 | 0.0845 | 49.2 | +| Fresnel | 4096 | 17.5 | 0.225 | 77.8 | ++---------------+-------+-------+---------------+-------+ + diff --git a/docs/index.rst b/docs/index.rst index d1e727e7..18bcaced 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -67,6 +67,7 @@ Contents about.rst performance.rst fft_optimization.rst + gpu_acceleration.rst dev_notes.rst diff --git a/docs/relnotes.rst b/docs/relnotes.rst index 04598c5d..3880ea63 100644 --- a/docs/relnotes.rst +++ b/docs/relnotes.rst @@ -5,15 +5,23 @@ Release Notes For a list of contributors, see :ref:`about`. - 1.1.0 - ----- +.. _rel1.1.0: - .. _rel1.1.0:: +1.1.0 +----- + +This release introduces support for much faster (20-80x) optical calculations using GPU acceleration via the CuPy library for NVidia GPUs. Credit to Kian Milani (:user:`kian1337`) for this significant and complex improvement. + + +*2023 date TBD* - *2023 date TBD* + **Performance Enhancements:** + * Implement CuPy GPU acceleration for calculations throughout poppy. (PR :pr:`499` by :user:`kian1377` and :user:`mperrin`) **Software Infrastructure Updates and Internals:** - * Increase all CI test versions by 1, removing Python 3.8 and adding Python 3.11. Update minimum supported versions of astropy and numpy as well. (PR TBD by :user:`mperrin`) + * Increase all CI test versions by 1, removing Python 3.8 and adding Python 3.11. Update minimum supported versions of astropy and numpy as well. (PR :pr:`551` by :user:`BradleySappington`) + * Updates to dependencies (various PRs) + * Bug fix for a recent change in numpy interface (:pr:`545` by :user:`adambolton`) 1.0.3 From e4eac9ba1ccec0de55d7aa1c5cc520970725f701 Mon Sep 17 00:00:00 2001 From: Marshall Perrin Date: Tue, 9 May 2023 17:28:33 -0400 Subject: [PATCH 3/3] remove unnecessarywq deprecated cuda line from __init__.py --- poppy/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/poppy/__init__.py b/poppy/__init__.py index d3857ca6..31349954 100644 --- a/poppy/__init__.py +++ b/poppy/__init__.py @@ -62,9 +62,6 @@ class Conf(_config.ConfigNamespace): '"wisdom" for improved speed?') use_mkl = _config.ConfigItem(True, "Use Intel MKL for FFTs (assuming it is available). " "This has highest priority for CPU-based FFT over other FFT options, if multiple are set True.") - - use_cuda = _config.ConfigItem(True, 'Use cuda for FFTs on GPU (assuming it' + - 'is available)?') use_opencl = _config.ConfigItem(True, 'Use OpenCL for FFTs on GPU (assuming it' + 'is available)?') use_cupy = _config.ConfigItem(True, 'Use CuPy for FFTs on GPU (assuming it' +