diff --git a/README.md b/README.md
index 1993ce568..fd5b1a9f9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # RTE+RRTMGP
 
-This is the repository for RTE+RRTMGP, a set of codes for computing radiative fluxes in planetary atmospheres. RTE+RRTMGP is described in a [manuscript revised May 10, 2019](https://doi.org/10.1002/essoar.10500964.1) to [Journal of Advances in Modeling Earth Systems](http://james.agu.org). 
+This is the repository for RTE+RRTMGP, a set of codes for computing radiative fluxes in planetary atmospheres. RTE+RRTMGP is described in a [paper](https://doi.org/10.1029/2019MS001621) in [Journal of Advances in Modeling Earth Systems](http://james.agu.org). 
 
 RRTMGP uses a k-distribution to provide an optical description (absorption and possibly Rayleigh optical depth) of the gaseous atmosphere, along with the relevant source functions, on a pre-determined spectral grid given temperatures, pressures, and gas concentration. The k-distribution currently distributed with this package is applicable to the Earth's atmosphere under present-day, pre-industrial, and 4xCO2 conditions.
 
@@ -11,13 +11,13 @@ Example programs and documenation are evolving - please see examples/ in the rep
 ## Building the libraries.
 
 1. `cd build`
-2. Set environment variables `FC` (the Fortran 2003 compiler) and `FCFLAGS` (compiler flags). Alternately create a Makefile.conf that sets these variables. You could also link to an existing file. 
-3. Set environment variable `RTE_KERNELS` to `openacc` if you want the OpenACC kernels rather than the default. 
+2. Set environment variables `FC` (the Fortran 2003 compiler) and `FCFLAGS` (compiler flags). Alternately create a Makefile.conf that sets these variables. You could also link to an existing file.
+3. Set environment variable `RTE_KERNELS` to `openacc` if you want the OpenACC kernels rather than the default.
 4. `make`
 
 ## Building and running the examples.
 
-1. From the root RTE+RRTMGP directory: `cd examples/rfmip-clear-sky`. 
-2. Set environment variables `NCHOME` and `NFHOME` to the root directories of the Netcdf C and Fortran libraries respectively. Set environment variable `RRTMGP_DIR` to the location of the libraries (`../../build`) in the default layout). 
+1. From the root RTE+RRTMGP directory: `cd examples/rfmip-clear-sky`.
+2. Set environment variables `NCHOME` and `NFHOME` to the root directories of the Netcdf C and Fortran libraries respectively. Set environment variable `RRTMGP_DIR` to the location of the libraries (`../../build`) in the default layout).
 3. `make`
 4. Python scripts are provided to stage the files needed (`stage_files.py`),  run the examples `run-rfmip-examples.py`), and compare to results computed on an example host (`compare-to-reference.py`). The python scripts require modules xarray and netCDF.  
diff --git a/build/Makefile.conf.ifort b/build/Makefile.conf.ifort
index 3e6c66648..b2b47bfb3 100644
--- a/build/Makefile.conf.ifort
+++ b/build/Makefile.conf.ifort
@@ -8,8 +8,8 @@ export FC = ifort
 #
 # Optimized
 #
-export FCFLAGS  += -m64 -O3 -traceback -assume realloc_lhs -extend-source 132
-export F77FLAGS += -m64 -O3 -traceback
+export FCFLAGS  += -m64 -O3 -g -traceback -assume realloc_lhs -extend-source 132
+export F77FLAGS += -m64 -O3 -g -traceback
 # can add  -qopt-report-phase=vec
 #
 # Debugging
diff --git a/build/Makefile.conf.pgfortran-cscs b/build/Makefile.conf.pgfortran-cscs
index 19eafc6e8..743a25ff1 100644
--- a/build/Makefile.conf.pgfortran-cscs
+++ b/build/Makefile.conf.pgfortran-cscs
@@ -2,10 +2,11 @@
 
 # Load the following modules to compile with PGI for CPU
 #
-#  module swap PrgEnv-cray PrgEnv-pgi
-#  module swap pgi pgi/18.10.0
-#  module load cray-netcdf cray-hdf5
-#  module unload cray-libsci_acc
+# module load cdt/19.06
+# module swap PrgEnv-cray PrgEnv-pgi
+# module load cray-netcdf cray-hdf5
+# module load craype-accel-nvidia60
+# module unload cray-libsci_acc
 #
 #
 # Fortran compiler command
diff --git a/build/Makefile.conf.pgfortran-cscs-gpu b/build/Makefile.conf.pgfortran-cscs-gpu
index 21288b37f..571589b1d 100644
--- a/build/Makefile.conf.pgfortran-cscs-gpu
+++ b/build/Makefile.conf.pgfortran-cscs-gpu
@@ -2,10 +2,11 @@
 
 # Load the following modules
 #
-#  module swap PrgEnv-cray PrgEnv-pgi
-#  module load cray-netcdf cray-hdf5
-#  module load craype-accel-nvidia60
-#  module unload cray-libsci_acc
+# module load cdt/19.06
+# module swap PrgEnv-cray PrgEnv-pgi
+# module load cray-netcdf cray-hdf5
+# module load craype-accel-nvidia60
+# module unload cray-libsci_acc
 #
 #
 # Fortran compiler command
@@ -14,7 +15,7 @@ NCHOME =
 
 # Fortran compiler flags
 #FCFLAGS = -g -Minfo -Mbounds -Mchkptr -Mstandard -Kieee -Mchkstk -Mipa=fast,inline -acc -ta=tesla:6.5
-FCFLAGS = -g -ta=tesla:cc60,cuda9.1 -Minfo -Mbounds -Mchkptr -Mstandard -Kieee -Mchkstk                   -Mallocatable=03
+FCFLAGS = -g -ta=tesla:cc60,cuda9.2 -Minfo -Mbounds -Mchkptr -Mstandard -Kieee -Mchkstk -Mallocatable=03
 
 # Fortran .mod files, e.g. -I<include dir> if you have headers in a nonstandard directory <include dir>
 FCINCLUDE =
diff --git a/examples/mo_load_coefficients.F90 b/examples/mo_load_coefficients.F90
index 9c3a88093..edf2ea4a5 100644
--- a/examples/mo_load_coefficients.F90
+++ b/examples/mo_load_coefficients.F90
@@ -97,7 +97,7 @@ subroutine load_and_init(kdist, filename, available_gases)
     !
     ! How big are the various arrays?
     !
-    if(nf90_open(trim(fileName), NF90_WRITE, ncid) /= NF90_NOERR) &
+    if(nf90_open(trim(fileName), NF90_NOWRITE, ncid) /= NF90_NOERR) &
       call stop_on_err("load_and_init(): can't open file " // trim(fileName))
     ntemps            = get_dim_size(ncid,'temperature')
     npress            = get_dim_size(ncid,'pressure')
diff --git a/examples/rfmip-clear-sky/Makefile.libs.pgfortran-cscs b/examples/rfmip-clear-sky/Makefile.libs.pgfortran-cscs
index c71d62c45..3bd5a213b 100644
--- a/examples/rfmip-clear-sky/Makefile.libs.pgfortran-cscs
+++ b/examples/rfmip-clear-sky/Makefile.libs.pgfortran-cscs
@@ -1,8 +1,11 @@
-# Load the following modules
+# Load the following modules and set the library path
 #
-#  module swap PrgEnv-cray PrgEnv-pgi
-#  module load cray-netcdf cray-hdf5
-#  module unload cray-libsci_acc
+# module load cdt/19.06
+# module swap PrgEnv-cray PrgEnv-pgi
+# module load cray-netcdf cray-hdf5
+# module load craype-accel-nvidia60
+# module unload cray-libsci_acc
+# export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
 
 export FC = ftn
 export FCFLAGS = -g -Minfo -Mbounds -Mchkptr -Mstandard -Kieee -Mchkstk -Mipa=fast,inline -Mallocatable=03
diff --git a/examples/rfmip-clear-sky/compare-to-reference.py b/examples/rfmip-clear-sky/compare-to-reference.py
index ed4cb5bdd..dc805a605 100755
--- a/examples/rfmip-clear-sky/compare-to-reference.py
+++ b/examples/rfmip-clear-sky/compare-to-reference.py
@@ -5,12 +5,31 @@
 import os
 import numpy as np
 import xarray as xr
+import urllib.request
 
 ref_dir  = "reference"
 tst_dir = "."
 
-rrtmg_suffix = "_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"
+#
+# Download reference results
+#
+print("Downloading reference results")
+ref_dir = "./reference/"
+if not os.path.exists(ref_dir):
+    os.makedirs(ref_dir)
+urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/kbhl3JOSccGtR0m/download", \
+                           os.path.join(ref_dir, "rld_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
+urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/iFa28GFxRaNGKU1/download", \
+                           os.path.join(ref_dir, "rlu_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
+urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/uCemCHlGxbGK0gJ/download", \
+                           os.path.join(ref_dir, "rsd_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
+urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/l8ZG28j9ttZWD9r/download", \
+                           os.path.join(ref_dir, "rsu_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
 
+#
+# Comparing reference and test results 
+#
+rrtmg_suffix = "_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"
 tst = xr.open_mfdataset(os.path.join(tst_dir, "r??" + rrtmg_suffix))
 ref = xr.open_mfdataset(os.path.join(ref_dir, "r??" + rrtmg_suffix))
 
diff --git a/examples/rfmip-clear-sky/rrtmgp_rfmip_lw.F90 b/examples/rfmip-clear-sky/rrtmgp_rfmip_lw.F90
index 464239062..5c43e5d1d 100644
--- a/examples/rfmip-clear-sky/rrtmgp_rfmip_lw.F90
+++ b/examples/rfmip-clear-sky/rrtmgp_rfmip_lw.F90
@@ -225,10 +225,9 @@ program rrtmgp_rfmip_lw
   ! NOTE: these are causing problems right now, most likely due to a compiler
   ! bug related to the use of Fortran classes on the GPU.
   !
-  !!!$acc enter data create(sfc_emis_spec)
-  !!$acc enter data create(optical_props, optical_props%tau)
-  !!$acc enter data create(source, source%lay_source, source%lev_source_inc, source%lev_source_dec)
-  !!$acc enter data create(source%sfc_source, source%band2gpt, source%gpt2band, source%band_lims_wvn)
+  !$acc enter data create(sfc_emis_spec)
+  !$acc enter data create(optical_props, optical_props%tau)
+  !$acc enter data create(source, source%lay_source, source%lev_source_inc, source%lev_source_dec, source%sfc_source)
   ! --------------------------------------------------
 #ifdef USE_TIMING
   !
@@ -251,7 +250,7 @@ program rrtmgp_rfmip_lw
     ! Expand the spectrally-constant surface emissivity to a per-band emissivity for each column
     !   (This is partly to show how to keep work on GPUs using OpenACC)
     !
-    !!$acc parallel loop collapse(2) copyin(sfc_emis)
+    !$acc parallel loop collapse(2) copyin(sfc_emis)
     do icol = 1, block_size
       do ibnd = 1, nbnd
         sfc_emis_spec(ibnd,icol) = sfc_emis(icol,b)
@@ -299,10 +298,10 @@ program rrtmgp_rfmip_lw
   ret = gptlpr(block_size)
   ret = gptlfinalize()
 #endif
-  !!!$acc exit data delete(sfc_emis_spec)
-  !!$acc exit data delete(optical_props%tau, optical_props)
-  !!$acc exit data delete(source%lay_source, source%lev_source_inc, source%lev_source_dec, source%sfc_source)
-  !!$acc exit data delete(source%band2gpt, source%gpt2band, source%band_lims_wvn, source)
+  !$acc exit data delete(sfc_emis_spec)
+  !$acc exit data delete(optical_props%tau, optical_props)
+  !$acc exit data delete(source%lay_source, source%lev_source_inc, source%lev_source_dec, source%sfc_source)
+  !$acc exit data delete(source)
   ! --------------------------------------------------m
   call unblock_and_write(trim(flxup_file), 'rlu', flux_up)
   call unblock_and_write(trim(flxdn_file), 'rld', flux_dn)
diff --git a/examples/rfmip-clear-sky/rrtmgp_rfmip_sw.F90 b/examples/rfmip-clear-sky/rrtmgp_rfmip_sw.F90
index 845050a89..cdf2e72c8 100644
--- a/examples/rfmip-clear-sky/rrtmgp_rfmip_sw.F90
+++ b/examples/rfmip-clear-sky/rrtmgp_rfmip_sw.F90
@@ -46,6 +46,10 @@ program rrtmgp_rfmip_sw
   !
   use mo_rte_kind,           only: wp
   !
+  ! Array utilities
+  !
+  use mo_util_array,         only: zero_array
+  !
   ! Optical properties of the atmosphere as array of values
   !   In the longwave we include only absorption optical depth (_1scl)
   !   Shortwave calculations use optical depth, single-scattering albedo, asymmetry parameter (_2str)
@@ -224,11 +228,9 @@ program rrtmgp_rfmip_sw
            flux_dn(block_size, nlay+1, nblocks))
   allocate(mu0(block_size), sfc_alb_spec(nbnd,block_size))
   call stop_on_err(optical_props%alloc_2str(block_size, nlay, k_dist))
-  ! Handle GPU data. Leave mu0, sfc_alb_spec, toa_flux, and def_tsi on CPU for
-  ! now, and let compiler or CUDA runtime handle data movement because not
-  ! everything is in kernels at the next level down yet.
-  !!$acc enter data create(optical_props, optical_props%tau, optical_props%ssa, optical_props%g)
-  !!!$acc enter data create(mu0,sfc_alb_spec,toa_flux,def_tsi)
+  !$acc enter data create(optical_props, optical_props%tau, optical_props%ssa, optical_props%g)
+  !$acc enter data create (toa_flux, def_tsi)
+  !$acc enter data create (sfc_alb_spec, mu0)
   ! --------------------------------------------------
 #ifdef USE_TIMING
   !
@@ -265,25 +267,27 @@ program rrtmgp_rfmip_sw
 #endif
     ! Boundary conditions
     !   (This is partly to show how to keep work on GPUs using OpenACC in a host application)
-    !
     ! What's the total solar irradiance assumed by RRTMGP?
-    !  The first two loops could be more expressed more ompactly as def_tsi(1:block_size) = sum(toa_flux, dim=2)
     !
-    !!!$acc parallel loop
-    do icol = 1, block_size
-      def_tsi(icol) = toa_flux(icol, 1)
-    end do
-    !!!$acc parallel loop collapse(2)
+#ifdef _OPENACC
+    call zero_array(block_size, def_tsi)
+    !$acc parallel loop collapse(2) copy(def_tsi) copyin(toa_flux)
     do igpt = 1, ngpt
       do icol = 1, block_size
-        !!$acc atomic update
+        !$acc atomic update
         def_tsi(icol) = def_tsi(icol) + toa_flux(icol, igpt)
       end do
     end do
+#else
+    !
+    ! More compactly...
+    !
+    def_tsi(1:block_size) = sum(toa_flux, dim=2)
+#endif
     !
     ! Normalize incoming solar flux to match RFMIP specification
     !
-    !!!$acc parallel loop collapse(2)
+    !$acc parallel loop collapse(2) copyin(total_solar_irradiance, def_tsi) copy(toa_flux)
     do igpt = 1, ngpt
       do icol = 1, block_size
         toa_flux(icol,igpt) = toa_flux(icol,igpt) * total_solar_irradiance(icol,b)/def_tsi(icol)
@@ -292,7 +296,7 @@ program rrtmgp_rfmip_sw
     !
     ! Expand the spectrally-constant surface albedo to a per-band albedo for each column
     !
-    !!!$acc parallel loop collapse(2)
+    !$acc parallel loop collapse(2) copyin(surface_albedo)
     do icol = 1, block_size
       do ibnd = 1, nbnd
         sfc_alb_spec(ibnd,icol) = surface_albedo(icol,b)
@@ -301,7 +305,7 @@ program rrtmgp_rfmip_sw
     !
     ! Cosine of the solar zenith angle
     !
-    !!!$acc parallel loop
+    !$acc parallel loop copyin(solar_zenith_angle, usecol)
     do icol = 1, block_size
       mu0(icol) = merge(cos(solar_zenith_angle(icol,b)*deg_to_rad), 1._wp, usecol(icol,b))
     end do
@@ -341,8 +345,9 @@ program rrtmgp_rfmip_sw
   ret = gptlpr(block_size)
   ret = gptlfinalize()
 #endif
-  !!$acc exit data delete(optical_props, optical_props%tau, optical_props%ssa, optical_props%g)
-  !!!$acc exit data delete(mu0,sfc_alb_spec,toa_flux,def_tsi)
+  !$acc exit data delete(optical_props%tau, optical_props%ssa, optical_props%g, optical_props)
+  !$acc exit data delete(sfc_alb_spec, mu0)
+  !$acc exit data delete(toa_flux, def_tsi)
   ! --------------------------------------------------
   call unblock_and_write(trim(flxup_file), 'rsu', flux_up)
   call unblock_and_write(trim(flxdn_file), 'rsd', flux_dn)
diff --git a/examples/rfmip-clear-sky/stage_files.py b/examples/rfmip-clear-sky/stage_files.py
index 35f340a4f..a1556e2ab 100755
--- a/examples/rfmip-clear-sky/stage_files.py
+++ b/examples/rfmip-clear-sky/stage_files.py
@@ -32,21 +32,4 @@
 urllib.request.urlretrieve(conds_url,     conds_file)
 print("Dowloading scripts for generating output templates")
 urllib.request.urlretrieve(templ_scr_url, templ_scr)
-#%run -i generate-output-file-templates.py --source_id RTE-RRTMGP-181204
 subprocess.run(["python3", templ_scr, "--source_id", "RTE-RRTMGP-181204"])
-
-#
-# Reference results
-#
-print("Downloading reference results")
-ref_dir = "./reference/"
-if not os.path.exists(ref_dir):
-    os.makedirs(ref_dir)
-urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/kbhl3JOSccGtR0m/download", \
-                           os.path.join(ref_dir, "rld_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
-urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/iFa28GFxRaNGKU1/download", \
-                           os.path.join(ref_dir, "rlu_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
-urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/uCemCHlGxbGK0gJ/download", \
-                           os.path.join(ref_dir, "rsd_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
-urllib.request.urlretrieve("https://owncloud.gwdg.de/index.php/s/l8ZG28j9ttZWD9r/download", \
-                           os.path.join(ref_dir, "rsu_Efx_RTE-RRTMGP-181204_rad-irf_r1i1p1f1_gn.nc"))
diff --git a/rrtmgp/kernels-openacc/mo_gas_optics_kernels.F90 b/rrtmgp/kernels-openacc/mo_gas_optics_kernels.F90
index 0889b8807..d51319fad 100644
--- a/rrtmgp/kernels-openacc/mo_gas_optics_kernels.F90
+++ b/rrtmgp/kernels-openacc/mo_gas_optics_kernels.F90
@@ -17,10 +17,6 @@
 module mo_gas_optics_kernels
   use mo_rte_kind,      only: wp, wl
   implicit none
-
-  interface zero_array
-    module procedure zero_array_3D, zero_array_4D
-  end interface
 contains
   ! --------------------------------------------------------------------------------------
   ! Compute interpolation coefficients
@@ -454,7 +450,8 @@ subroutine gas_optical_depths_minor(ncol,nlay,ngpt,        &
                 tau_minor = 0._wp
                 iflav = gpt_flv(idx_tropo,igpt) ! eta interpolation depends on flavor
                 minor_loc = minor_start + (igpt - gptS) ! add offset to starting point
-                kminor_loc = interpolate2D(fminor(:,:,iflav,icol,ilay), kminor, minor_loc, jeta(:,iflav,icol,ilay), jtemp(icol,ilay))
+                kminor_loc = interpolate2D(fminor(:,:,iflav,icol,ilay), kminor, minor_loc, &
+                                           jeta(:,iflav,icol,ilay), jtemp(icol,ilay))
                 tau_minor = kminor_loc * scaling
                 !$acc atomic update
                 tau(igpt,ilay,icol) = tau(igpt,ilay,icol) + tau_minor
@@ -553,7 +550,7 @@ subroutine compute_Planck_source(                        &
     real(wp) :: planck_function(nbnd,nlay+1,ncol)
     ! -----------------
 
-    !$acc enter data copyin(tlay,tlev,tsfc,fmajor,jeta,tropo,jtemp,jpress,gpoint_bands,temp_ref_min,totplnk_delta,pfracin,totplnk,gpoint_flavor,one)
+    !$acc enter data copyin(tlay,tlev,tsfc,fmajor,jeta,tropo,jtemp,jpress,gpoint_bands,pfracin,totplnk,gpoint_flavor,one)
     !$acc enter data create(sfc_src,lay_src,lev_src_inc,lev_src_dec)
     !$acc enter data create(pfrac,planck_function)
 
@@ -636,9 +633,9 @@ subroutine compute_Planck_source(                        &
       end do ! ilay
     end do ! icol
 
-    !$acc exit data delete(tlay,tlev,tsfc,fmajor,jeta,tropo,jtemp,jpress,gpoint_bands,temp_ref_min,totplnk_delta,pfracin,totplnk,gpoint_flavor,one)
-    !$acc exit data copyout(sfc_src,lay_src,lev_src_inc,lev_src_dec)
+    !$acc exit data delete(tlay,tlev,tsfc,fmajor,jeta,tropo,jtemp,jpress,gpoint_bands,pfracin,totplnk,gpoint_flavor,one)
     !$acc exit data delete(pfrac,planck_function)
+    !$acc exit data copyout(sfc_src,lay_src,lev_src_inc,lev_src_dec)
 
   end subroutine compute_Planck_source
   ! ----------------------------------------------------------
@@ -784,42 +781,4 @@ subroutine combine_and_reorder_nstr(ncol, nlay, ngpt, nmom, tau_abs, tau_rayleig
     end do
   end subroutine combine_and_reorder_nstr
   ! ----------------------------------------------------------
-  subroutine zero_array_3D(ni, nj, nk, array) bind(C, name="zero_array_3D")
-    integer, intent(in) :: ni, nj, nk
-    real(wp), dimension(ni, nj, nk), intent(out) :: array
-    ! -----------------------
-    integer :: i,j,k
-    ! -----------------------
-    !$acc parallel loop collapse(3) &
-    !$acc&     copyout(array(:ni,:nj,:nk))
-    do k = 1, nk
-      do j = 1, nj
-        do i = 1, ni
-          array(i,j,k) = 0.0_wp
-        end do
-      end do
-    end do
-
-  end subroutine zero_array_3D
-  ! ----------------------------------------------------------
-  subroutine zero_array_4D(ni, nj, nk, nl, array) bind(C, name="zero_array_4D")
-    integer, intent(in) :: ni, nj, nk, nl
-    real(wp), dimension(ni, nj, nk, nl), intent(out) :: array
-    ! -----------------------
-    integer :: i,j,k,l
-    ! -----------------------
-    !$acc parallel loop collapse(4) &
-    !$acc&     copyout(array(:ni,:nj,:nk,:nl))
-    do l = 1, nl
-      do k = 1, nk
-        do j = 1, nj
-          do i = 1, ni
-            array(i,j,k,l) = 0.0_wp
-          end do
-        end do
-      end do
-    end do
-
-  end subroutine zero_array_4D
-  ! ----------------------------------------------------------
 end module mo_gas_optics_kernels
diff --git a/rrtmgp/kernels-openacc/mo_reorder_kernels.F90 b/rrtmgp/kernels-openacc/mo_reorder_kernels.F90
deleted file mode 100644
index ab683ac09..000000000
--- a/rrtmgp/kernels-openacc/mo_reorder_kernels.F90
+++ /dev/null
@@ -1,61 +0,0 @@
-! This code is part of
-! RRTM for GCM Applications - Parallel (RRTMGP)
-!
-! Eli Mlawer and Robert Pincus
-! Andre Wehe and Jennifer Delamere
-! email:  rrtmgp@aer.com
-!
-! Copyright 2018,  Atmospheric and Environmental Research and
-! Regents of the University of Colorado.  All right reserved.
-!
-! Use and duplication is permitted under the terms of the
-!    BSD 3-clause license, see http://opensource.org/licenses/BSD-3-Clause
-!
-! Description: Kernels to permute arrays
-
-module mo_reorder_kernels
-  use mo_rte_kind,      only: wp
-  implicit none
-contains
-  ! ----------------------------------------------------------------------------
-  subroutine reorder_123x312_kernel(d1, d2, d3, array_in, array_out) &
-      bind(C, name = "reorder_123x312_kernel")
-    integer,                         intent( in) :: d1, d2, d3
-    real(wp), dimension(d1, d2, d3), intent( in) :: array_in
-    real(wp), dimension(d3, d1, d2), intent(out) :: array_out
-
-    integer :: i1, i2, i3
-
-    !$acc parallel loop collapse(3) &
-    !$acc&     copyout(array_out(:d3,:d1,:d2)) &
-    !$acc&     copyin(array_in(:d1,:d2,:d3))
-    do i2 = 1, d2
-      do i1 = 1, d1
-        do i3 = 1, d3
-          array_out(i3,i1,i2) = array_in(i1,i2,i3)
-        end do
-      end do
-    end do
-  end subroutine reorder_123x312_kernel
-  ! ----------------------------------------------------------------------------
-  subroutine reorder_123x321_kernel(d1, d2, d3, array_in, array_out) & 
-      bind(C, name="reorder_123x321_kernel")
-    integer,                         intent( in) :: d1, d2, d3
-    real(wp), dimension(d1, d2, d3), intent( in) :: array_in
-    real(wp), dimension(d3, d2, d1), intent(out) :: array_out
-
-    integer :: i1, i2, i3
-
-    !$acc parallel loop collapse(3) &
-    !$acc&     copyout(array_out(:d3,:d2,:d1)) &
-    !$acc&     copyin(array_in(:d1,:d2,:d3))
-    do i1 = 1, d1
-      do i2 = 1, d2
-        do i3 = 1, d3
-          array_out(i3,i2,i1) = array_in(i1,i2,i3)
-        end do
-      end do
-    end do
-  end subroutine reorder_123x321_kernel
-  ! ----------------------------------------------------------------------------
-end module mo_reorder_kernels
diff --git a/rrtmgp/kernels/mo_gas_optics_kernels.F90 b/rrtmgp/kernels/mo_gas_optics_kernels.F90
index ce1e0f592..9b9f99e7e 100644
--- a/rrtmgp/kernels/mo_gas_optics_kernels.F90
+++ b/rrtmgp/kernels/mo_gas_optics_kernels.F90
@@ -17,10 +17,6 @@
 module mo_gas_optics_kernels
   use mo_rte_kind,      only : wp, wl
   implicit none
-
-  interface zero_array
-    module procedure zero_array_3D, zero_array_4D
-  end interface
 contains
   ! --------------------------------------------------------------------------------------
   ! Compute interpolation coefficients
@@ -774,39 +770,4 @@ pure subroutine combine_and_reorder_nstr(ncol, nlay, ngpt, nmom, tau_abs, tau_ra
       end do
     end do
   end subroutine combine_and_reorder_nstr
-  ! ----------------------------------------------------------
-  pure subroutine zero_array_3D(ni, nj, nk, array) bind(C, name="zero_array_3D")
-    integer, intent(in) :: ni, nj, nk
-    real(wp), dimension(ni, nj, nk), intent(out) :: array
-    ! -----------------------
-    integer :: i,j,k
-    ! -----------------------
-    do k = 1, nk
-      do j = 1, nj
-        do i = 1, ni
-          array(i,j,k) = 0.0_wp
-        end do
-      end do
-    end do
-
-  end subroutine zero_array_3D
-  ! ----------------------------------------------------------
-  pure subroutine zero_array_4D(ni, nj, nk, nl, array) bind(C, name="zero_array_4D")
-    integer, intent(in) :: ni, nj, nk, nl
-    real(wp), dimension(ni, nj, nk, nl), intent(out) :: array
-    ! -----------------------
-    integer :: i,j,k,l
-    ! -----------------------
-    do l = 1, nl
-      do k = 1, nk
-        do j = 1, nj
-          do i = 1, ni
-            array(i,j,k,l) = 0.0_wp
-          end do
-        end do
-      end do
-    end do
-
-  end subroutine zero_array_4D
-  ! ----------------------------------------------------------
 end module mo_gas_optics_kernels
diff --git a/rrtmgp/kernels/mo_reorder_kernels.F90 b/rrtmgp/kernels/mo_reorder_kernels.F90
index c8d06c88c..ab683ac09 100644
--- a/rrtmgp/kernels/mo_reorder_kernels.F90
+++ b/rrtmgp/kernels/mo_reorder_kernels.F90
@@ -26,6 +26,9 @@ subroutine reorder_123x312_kernel(d1, d2, d3, array_in, array_out) &
 
     integer :: i1, i2, i3
 
+    !$acc parallel loop collapse(3) &
+    !$acc&     copyout(array_out(:d3,:d1,:d2)) &
+    !$acc&     copyin(array_in(:d1,:d2,:d3))
     do i2 = 1, d2
       do i1 = 1, d1
         do i3 = 1, d3
@@ -43,6 +46,9 @@ subroutine reorder_123x321_kernel(d1, d2, d3, array_in, array_out) &
 
     integer :: i1, i2, i3
 
+    !$acc parallel loop collapse(3) &
+    !$acc&     copyout(array_out(:d3,:d2,:d1)) &
+    !$acc&     copyin(array_in(:d1,:d2,:d3))
     do i1 = 1, d1
       do i2 = 1, d2
         do i3 = 1, d3
diff --git a/rrtmgp/mo_gas_optics_rrtmgp.F90 b/rrtmgp/mo_gas_optics_rrtmgp.F90
index 653f66a33..e33ee4efa 100644
--- a/rrtmgp/mo_gas_optics_rrtmgp.F90
+++ b/rrtmgp/mo_gas_optics_rrtmgp.F90
@@ -23,12 +23,12 @@
 module mo_gas_optics_rrtmgp
   use mo_rte_kind,           only: wp, wl
   use mo_rrtmgp_constants,   only: avogad, m_dry, m_h2o, grav
-  use mo_util_array,         only: any_vals_less_than, any_vals_outside
+  use mo_util_array,         only: zero_array, any_vals_less_than, any_vals_outside
   use mo_optical_props,      only: ty_optical_props
   use mo_source_functions,   only: ty_source_func_lw
   use mo_gas_optics_kernels, only: interpolation,                                                       &
                                    compute_tau_absorption, compute_tau_rayleigh, compute_Planck_source, &
-                                   combine_and_reorder_2str, combine_and_reorder_nstr, zero_array
+                                   combine_and_reorder_2str, combine_and_reorder_nstr
 
   use mo_util_string,        only: lower_case, string_in_array, string_loc_in_array
   use mo_gas_concentrations, only: ty_gas_concs
@@ -326,6 +326,7 @@ function gas_optics_ext(this,                         &
     integer,     dimension(2,    get_nflav(this),size(play,dim=1), size(play,dim=2)) :: jeta
 
     integer :: ncol, nlay, ngpt, nband, ngas, nflav
+    integer :: igpt, icol
     ! ----------------------------------------------------------
     ncol  = size(play,dim=1)
     nlay  = size(play,dim=2)
@@ -352,8 +353,12 @@ function gas_optics_ext(this,                         &
     !
     error_msg = check_extent(toa_src,     ncol,         ngpt, 'toa_src')
     if(error_msg  /= '') return
-    toa_src(:,:) = spread(this%solar_src(:), dim=1, ncopies=ncol)
-
+    !$acc parallel loop collapse(2)
+    do igpt = 1,ngpt
+       do icol = 1,ncol
+          toa_src(icol,igpt) = this%solar_src(igpt)
+       end do
+    end do
   end function gas_optics_ext
   !------------------------------------------------------------------------------------------
   !
@@ -494,12 +499,10 @@ function compute_gas_taus(this,                       &
     !
     !$acc enter data create(jtemp, jpress, jeta, tropo, fmajor)
     !$acc enter data create(tau, tau_rayleigh)
-    !$acc enter data copyin(play, tlay, col_gas)
     !$acc enter data create(col_mix, fminor)
+    !$acc enter data copyin(play, tlay, col_gas)
     !$acc enter data copyin(this)
-    !$acc enter data copyin(this%flavor, this%press_ref_log, this%vmr_ref, this%gpoint_flavor)
-    !$acc enter data copyin(this%temp_ref)  ! this one causes problems
-    !$acc enter data copyin(this%kminor_lower, this%kminor_upper)
+    !$acc enter data copyin(this%gpoint_flavor)
     call zero_array(ngpt, nlay, ncol, tau)
     call interpolation(               &
             ncol,nlay,                &        ! problem dimensions
@@ -565,12 +568,11 @@ function compute_gas_taus(this,                       &
 
     ! Combine optical depths and reorder for radiative transfer solver.
     call combine_and_reorder(tau, tau_rayleigh, allocated(this%krayl), optical_props)
-    !$acc exit data copyout(jtemp, jpress, jeta, tropo, fmajor)
     !$acc exit data delete(tau, tau_rayleigh)
-    !$acc exit data delete(play, tlay, col_gas, col_mix, fminor)
-    !$acc exit data delete(this%flavor, this%press_ref_log, this%vmr_ref, this%gpoint_flavor)
-    !!!$acc exit data delete(this%temp_ref)  ! this one causes problems
-    !!!$acc exit data delete(this%kminor_lower, this%kminor_upper)
+    !$acc exit data delete(play, tlay, col_gas)
+    !$acc exit data delete(col_mix, fminor)
+    !$acc exit data delete(this%gpoint_flavor)
+    !$acc exit data copyout(jtemp, jpress, jeta, tropo, fmajor)
   end function compute_gas_taus
   !------------------------------------------------------------------------------------------
   !
@@ -585,24 +587,24 @@ function source(this,                               &
                   result(error_msg)
     ! inputs
     class(ty_gas_optics_rrtmgp),    intent(in ) :: this
-    integer,                               intent(in ) :: ncol, nlay, nbnd, ngpt
-    real(wp), dimension(ncol,nlay),        intent(in ) :: play   ! layer pressures [Pa, mb]
-    real(wp), dimension(ncol,nlay+1),      intent(in ) :: plev   ! level pressures [Pa, mb]
-    real(wp), dimension(ncol,nlay),        intent(in ) :: tlay   ! layer temperatures [K]
-    real(wp), dimension(ncol),             intent(in ) :: tsfc   ! surface skin temperatures [K]
+    integer,                               intent(in   ) :: ncol, nlay, nbnd, ngpt
+    real(wp), dimension(ncol,nlay),        intent(in   ) :: play   ! layer pressures [Pa, mb]
+    real(wp), dimension(ncol,nlay+1),      intent(in   ) :: plev   ! level pressures [Pa, mb]
+    real(wp), dimension(ncol,nlay),        intent(in   ) :: tlay   ! layer temperatures [K]
+    real(wp), dimension(ncol),             intent(in   ) :: tsfc   ! surface skin temperatures [K]
     ! Interplation coefficients
-    integer,     dimension(ncol,nlay),     intent(in ) :: jtemp, jpress
-    logical(wl), dimension(ncol,nlay),     intent(in ) :: tropo
+    integer,     dimension(ncol,nlay),     intent(in   ) :: jtemp, jpress
+    logical(wl), dimension(ncol,nlay),     intent(in   ) :: tropo
     real(wp),    dimension(2,2,2,get_nflav(this),ncol,nlay),  &
-                                           intent(in ) :: fmajor
+                                           intent(in   ) :: fmajor
     integer,     dimension(2,    get_nflav(this),ncol,nlay),  &
-                                           intent(in ) :: jeta
+                                           intent(in   ) :: jeta
     class(ty_source_func_lw    ),          intent(inout) :: sources
-    real(wp), dimension(ncol,nlay+1),      intent(in ), &
+    real(wp), dimension(ncol,nlay+1),      intent(in   ), &
                                       optional, target :: tlev          ! level temperatures [K]
     character(len=128)                                 :: error_msg
     ! ----------------------------------------------------------
-    integer                                      :: icol, ilay
+    integer                                      :: icol, ilay, igpt
     real(wp), dimension(ngpt,nlay,ncol)          :: lay_source_t, lev_source_inc_t, lev_source_dec_t
     real(wp), dimension(ngpt,     ncol)          :: sfc_source_t
     ! Variables for temperature at layer edges [K] (ncol, nlay+1)
@@ -644,6 +646,9 @@ function source(this,                               &
     !-------------------------------------------------------------------
     ! Compute internal (Planck) source functions at layers and levels,
     !  which depend on mapping from spectral space that creates k-distribution.
+    !$acc enter data copyin(sources)
+    !$acc enter data create(sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source)
+    !$acc enter data create(sfc_source_t, lay_source_t, lev_source_inc_t, lev_source_dec_t) attach(tlev_wk)
     call compute_Planck_source(ncol, nlay, nbnd, ngpt, &
                 get_nflav(this), this%get_neta(), this%get_npres(), this%get_ntemp(), this%get_nPlanckTemp(), &
                 tlay, tlev_wk, tsfc, merge(1,nlay,play(1,1) > play(1,nlay)), &
@@ -651,10 +656,18 @@ function source(this,                               &
                 this%get_gpoint_bands(), this%get_band_lims_gpoint(), this%planck_frac, this%temp_ref_min,&
                 this%totplnk_delta, this%totplnk, this%gpoint_flavor,  &
                 sfc_source_t, lay_source_t, lev_source_inc_t, lev_source_dec_t)
-    sources%sfc_source     = transpose(sfc_source_t)
+    !$acc parallel loop collapse(2)
+    do igpt = 1, ngpt
+      do icol = 1, ncol
+        sources%sfc_source(icol,igpt) = sfc_source_t(igpt,icol)
+      end do
+    end do
     call reorder123x321(lay_source_t, sources%lay_source)
     call reorder123x321(lev_source_inc_t, sources%lev_source_inc)
     call reorder123x321(lev_source_dec_t, sources%lev_source_dec)
+    !$acc exit data delete(sfc_source_t, lay_source_t, lev_source_inc_t, lev_source_dec_t) detach(tlev_wk)
+    !$acc exit data copyout(sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source)
+    !$acc exit data copyout(sources)
   end function source
   !--------------------------------------------------------------------------------------------------------------------
   !
@@ -1535,7 +1548,7 @@ subroutine combine_and_reorder(tau, tau_rayleigh, has_rayleigh, optical_props)
     ncol = size(tau, 3)
     nlay = size(tau, 2)
     ngpt = size(tau, 1)
-
+    !$acc enter data copyin(optical_props)
     if (.not. has_rayleigh) then
       ! index reorder (ngpt, nlay, ncol) -> (ncol,nlay,gpt)
       !$acc enter data copyin(tau)
@@ -1579,6 +1592,7 @@ subroutine combine_and_reorder(tau, tau_rayleigh, has_rayleigh, optical_props)
       end select
       !$acc exit data delete(tau, tau_rayleigh)
     end if
+    !$acc exit data copyout(optical_props)
   end subroutine combine_and_reorder
 
   !--------------------------------------------------------------------------------------------------------------------
diff --git a/rte/Make.depends b/rte/Make.depends
index 0f0aeee3e..13123d913 100644
--- a/rte/Make.depends
+++ b/rte/Make.depends
@@ -36,6 +36,7 @@ mo_fluxes.o:                    mo_rte_kind.o mo_fluxes_broadband_kernels.o   mo
 mo_rte_solver_kernels.o:        mo_rte_kind.o mo_rte_solver_kernels.F90
 
 mo_rte_lw.o: mo_rte_kind.o \
+  mo_util_array.o \
 	mo_optical_props.o  \
   mo_source_functions.o \
 	mo_fluxes.o \
@@ -43,6 +44,7 @@ mo_rte_lw.o: mo_rte_kind.o \
 	mo_rte_lw.F90
 
 mo_rte_sw.o: mo_rte_kind.o \
+  mo_util_array.o \
 	mo_optical_props.o  \
   mo_source_functions.o \
 	mo_fluxes.o \
diff --git a/rte/kernels-openacc/mo_fluxes_broadband_kernels.F90 b/rte/kernels-openacc/mo_fluxes_broadband_kernels.F90
deleted file mode 100644
index a698fe0c0..000000000
--- a/rte/kernels-openacc/mo_fluxes_broadband_kernels.F90
+++ /dev/null
@@ -1,109 +0,0 @@
-! This code is part of Radiative Transfer for Energetics (RTE)
-!
-! Eli Mlawer and Robert Pincus
-! email:  rrtmgp@aer.com
-!
-! Copyright 2015-2018,  Atmospheric and Environmental Research and
-! Regents of the University of Colorado.  All right reserved.
-!
-! Use and duplication is permitted under the terms of the
-!    BSD 3-clause license, see http://opensource.org/licenses/BSD-3-Clause
-! -------------------------------------------------------------------------------------------------
-!
-! Kernels for computing broadband fluxes by summing over all elements in the spectral dimension
-!
-! -------------------------------------------------------------------------------------------------
-module mo_fluxes_broadband_kernels
-  use, intrinsic :: iso_c_binding
-  use mo_rte_kind, only: wp
-  implicit none
-  private
-  public :: sum_broadband, net_broadband
-
-  interface net_broadband
-    module procedure net_broadband_full, net_broadband_precalc
-  end interface net_broadband
-contains
-  ! ----------------------------------------------------------------------------
-  !
-  ! Spectral reduction over all points
-  !
-  subroutine sum_broadband(ncol, nlev, ngpt, spectral_flux, broadband_flux) bind (C, name="sum_broadband")
-    integer,                               intent(in ) :: ncol, nlev, ngpt
-    real(wp), dimension(ncol, nlev, ngpt), intent(in ) :: spectral_flux
-    real(wp), dimension(ncol, nlev),       intent(out) :: broadband_flux
-
-    integer  :: icol, ilev, igpt
-    real(wp) :: total
-    !$acc enter data copyin(spectral_flux) create(broadband_flux)
-    !$acc parallel loop collapse(2)
-    do ilev = 1, nlev
-      do icol = 1, ncol
-        broadband_flux(icol,ilev) = 0._wp
-      end do
-    end do
-
-    !$acc parallel loop collapse(3)
-    do ilev = 1, nlev
-      do icol = 1, ncol
-        do igpt = 1, ngpt
-          !$acc atomic update
-          broadband_flux(icol,ilev) = broadband_flux(icol,ilev) + spectral_flux(icol, ilev, igpt)
-        end do
-      end do
-    end do
-    !$acc exit data delete(spectral_flux) copyout(broadband_flux)
-  end subroutine sum_broadband
-  ! ----------------------------------------------------------------------------
-  !
-  ! Net flux: Spectral reduction over all points
-  !
-  subroutine net_broadband_full(ncol, nlev, ngpt, spectral_flux_dn, spectral_flux_up, broadband_flux_net) &
-    bind(C, name="net_broadband_full")
-    integer,                               intent(in ) :: ncol, nlev, ngpt
-    real(wp), dimension(ncol, nlev, ngpt), intent(in ) :: spectral_flux_dn, spectral_flux_up
-    real(wp), dimension(ncol, nlev),       intent(out) :: broadband_flux_net
-    integer  :: icol, ilev, igpt
-    real(wp) :: total, tmp
-
-    !$acc enter data copyin(spectral_flux_dn, spectral_flux_up) create(broadband_flux_net)
-    !$acc parallel loop collapse(2)
-    do ilev = 1, nlev
-      do icol = 1, ncol
-        broadband_flux_net(icol,ilev) = 0._wp
-      end do
-    end do
-
-    !$acc parallel loop collapse(3)
-    do ilev = 1, nlev
-      do icol = 1, ncol
-        do igpt = 1, ngpt
-          tmp = spectral_flux_dn(icol, ilev, igpt) - spectral_flux_up(icol, ilev, igpt)
-          !$acc atomic update
-          broadband_flux_net(icol,ilev) = broadband_flux_net(icol,ilev) + tmp
-        end do
-      end do
-    end do
-    !$acc exit data delete(spectral_flux_dn, spectral_flux_up) copyout(broadband_flux_net)
-  end subroutine net_broadband_full
-  ! ----------------------------------------------------------------------------
-  !
-  ! Net flux when bradband flux up and down are already available
-  !
-  subroutine net_broadband_precalc(ncol, nlay, flux_dn, flux_up, broadband_flux_net) &
-    bind (C, name="net_broadband_precalc")
-    integer,                         intent(in ) :: ncol, nlay
-    real(wp), dimension(ncol, nlay), intent(in ) :: flux_dn, flux_up
-    real(wp), dimension(ncol, nlay), intent(out) :: broadband_flux_net
-    integer :: icol, ilay
-    !$acc enter data copyin(flux_dn, flux_up) create(broadband_flux_net)
-    !$acc parallel loop collapse(2)
-    do ilay = 1, nlay
-      do icol = 1, ncol
-         broadband_flux_net(icol,ilay) = flux_dn(icol,ilay) - flux_up(icol,ilay)
-       end do
-    end do
-    !$acc exit data delete(flux_dn, flux_up) copyout(broadband_flux_net)
-  end subroutine net_broadband_precalc
-  ! ----------------------------------------------------------------------------
-end module mo_fluxes_broadband_kernels
diff --git a/rte/kernels-openacc/mo_rte_solver_kernels.F90 b/rte/kernels-openacc/mo_rte_solver_kernels.F90
index 4f5659f15..da7e0d307 100644
--- a/rte/kernels-openacc/mo_rte_solver_kernels.F90
+++ b/rte/kernels-openacc/mo_rte_solver_kernels.F90
@@ -106,15 +106,15 @@ subroutine lw_solver_noscat(ncol, nlay, ngpt, top_at_1, D, weight,
       lev_source_dn => lev_source_dec
     end if
 
-    !$acc enter data copyin(d,tau,sfc_src,sfc_emis,lev_source_dec,lev_source_inc,lay_source,radn_dn) async
-    !$acc enter data create(tau_loc,trans,source_dn,source_up,source_sfc,sfc_albedo,radn_up) async
-    !$acc enter data attach(lev_source_up,lev_source_dn) async
+    !$acc enter data copyin(d,tau,sfc_src,sfc_emis,lev_source_dec,lev_source_inc,lay_source,radn_dn)
+    !$acc enter data create(tau_loc,trans,source_dn,source_up,source_sfc,sfc_albedo,radn_up)
+    !$acc enter data attach(lev_source_up,lev_source_dn)
 
     ! NOTE: This kernel produces small differences between GPU and CPU
     ! implementations on Ascent with PGI, we assume due to floating point
     ! differences in the exp() function. These differences are small in the
     ! RFMIP test case (10^-6).
-    !$acc parallel loop collapse(3) async
+    !$acc parallel loop collapse(3)
     do igpt = 1, ngpt
       do ilev = 1, nlay
         do icol = 1, ncol
@@ -127,7 +127,7 @@ subroutine lw_solver_noscat(ncol, nlay, ngpt, top_at_1, D, weight,
       end do
     end do
 
-    !$acc parallel loop collapse(2) async
+    !$acc parallel loop collapse(2)
     do igpt = 1, ngpt
       do icol = 1, ncol
         !
@@ -160,7 +160,7 @@ subroutine lw_solver_noscat(ncol, nlay, ngpt, top_at_1, D, weight,
     !
     ! Convert intensity to flux assuming azimuthal isotropy and quadrature weight
     !
-    !$acc parallel loop collapse(3) async
+    !$acc parallel loop collapse(3)
     do igpt = 1, ngpt
       do ilev = 1, nlay+1
         do icol = 1, ncol
@@ -170,9 +170,9 @@ subroutine lw_solver_noscat(ncol, nlay, ngpt, top_at_1, D, weight,
       end do
     end do
 
-    !$acc exit data copyout(radn_dn,radn_up) async
-    !$acc exit data delete(d,tau,sfc_src,sfc_emis,lev_source_dec,lev_source_inc,lay_source,tau_loc,trans,source_dn,source_up,source_sfc,sfc_albedo) async
-    !$acc exit data detach(lev_source_up,lev_source_dn) async
+    !$acc exit data copyout(radn_dn,radn_up)
+    !$acc exit data delete(d,tau,sfc_src,sfc_emis,lev_source_dec,lev_source_inc,lay_source,tau_loc,trans,source_dn,source_up,source_sfc,sfc_albedo)
+    !$acc exit data detach(lev_source_up,lev_source_dn)
 
   end subroutine lw_solver_noscat
   ! ---------------------------------------------------------------
@@ -209,12 +209,12 @@ subroutine lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, top_at_1, nmus, Ds, weig
     integer :: imu, top_level
     integer :: icol, ilev, igpt
 
-    !$acc enter data copyin(Ds,weights,tau,lay_source,lev_source_inc,lev_source_dec,sfc_emis,sfc_src,flux_dn) async
-    !$acc enter data create(flux_up,radn_dn,radn_up,Ds_ncol) async
+    !$acc enter data copyin(Ds,weights,tau,lay_source,lev_source_inc,lev_source_dec,sfc_emis,sfc_src,flux_dn)
+    !$acc enter data create(flux_up,radn_dn,radn_up,Ds_ncol,flux_top)
 
     ! ------------------------------------
     ! ------------------------------------
-    !$acc  parallel loop collapse(2) async
+    !$acc  parallel loop collapse(2)
     do igpt = 1, ngpt
       do icol = 1, ncol
         Ds_ncol(icol, igpt) = Ds(1)
@@ -229,7 +229,7 @@ subroutine lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, top_at_1, nmus, Ds, weig
     ! For more than one angle use local arrays
     !
     top_level = MERGE(1, nlay+1, top_at_1)
-    !$acc parallel loop collapse(2) async
+    !$acc parallel loop collapse(2)
     do igpt = 1,ngpt
       do icol = 1,ncol
         flux_top(icol,igpt) = flux_dn(icol,top_level,igpt)
@@ -238,7 +238,7 @@ subroutine lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, top_at_1, nmus, Ds, weig
     call apply_BC(ncol, nlay, ngpt, top_at_1, flux_top, radn_dn)
 
     do imu = 2, nmus
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           Ds_ncol(icol, igpt) = Ds(imu)
@@ -248,7 +248,7 @@ subroutine lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, top_at_1, nmus, Ds, weig
                             top_at_1, Ds_ncol, weights(imu), tau, &
                             lay_source, lev_source_inc, lev_source_dec, sfc_emis, sfc_src, &
                             radn_up, radn_dn)
-      !$acc  parallel loop collapse(3) async
+      !$acc  parallel loop collapse(3)
       do igpt = 1, ngpt
         do ilev = 1, nlay+1
           do icol = 1, ncol
@@ -260,10 +260,8 @@ subroutine lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, top_at_1, nmus, Ds, weig
 
     end do ! imu loop
 
-    !$acc exit data copyout(flux_up,flux_dn) async
-    !$acc exit data delete(Ds,weights,tau,lay_source,lev_source_inc,lev_source_dec,sfc_emis,sfc_src,radn_dn,radn_up,Ds_ncol) async
-
-    !$acc wait
+    !$acc exit data copyout(flux_up,flux_dn)
+    !$acc exit data delete(Ds,weights,tau,lay_source,lev_source_inc,lev_source_dec,sfc_emis,sfc_src,radn_dn,radn_up,Ds_ncol,flux_top)
   end subroutine lw_solver_noscat_GaussQuad
   ! -------------------------------------------------------------------------------------------------
   !
@@ -494,7 +492,7 @@ subroutine lw_source_noscat(ncol, nlay, ngpt, lay_source, lev_source_up, lev_sou
       real(wp), parameter :: tau_thresh = sqrt(epsilon(tau))
       ! ---------------------------------------------------------------
       ! ---------------------------------------------------------------
-      !$acc  parallel loop collapse(3) async
+      !$acc  parallel loop collapse(3)
       do igpt = 1, ngpt
         do ilay = 1, nlay
           do icol = 1, ncol
@@ -545,7 +543,7 @@ subroutine lw_transport_noscat(ncol, nlay, ngpt, top_at_1, &
       !
       ! Top of domain is index 1
       !
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           ! Downward propagation
@@ -566,7 +564,7 @@ subroutine lw_transport_noscat(ncol, nlay, ngpt, top_at_1, &
       !
       ! Top of domain is index nlay+1
       !
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           ! Downward propagation
@@ -1104,21 +1102,20 @@ subroutine apply_BC_gpt(ncol, nlay, ngpt, top_at_1, inc_flux, flux_dn) bind (C,
     ! --------------
     !   Upper boundary condition
     if(top_at_1) then
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol,      1, igpt)  = inc_flux(icol,igpt)
         end do
       end do
     else
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol, nlay+1, igpt)  = inc_flux(icol,igpt)
         end do
       end do
     end if
-    !$acc wait
   end subroutine apply_BC_gpt
   ! ---------------------
   subroutine apply_BC_factor(ncol, nlay, ngpt, top_at_1, inc_flux, factor, flux_dn) bind (C, name="apply_BC_factor")
@@ -1134,21 +1131,20 @@ subroutine apply_BC_factor(ncol, nlay, ngpt, top_at_1, inc_flux, factor, flux_dn
 
     !   Upper boundary condition
     if(top_at_1) then
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol,      1, igpt)  = inc_flux(icol,igpt) * factor(icol)
         end do
       end do
     else
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol, nlay+1, igpt)  = inc_flux(icol,igpt) * factor(icol)
         end do
       end do
     end if
-    !$acc wait
   end subroutine apply_BC_factor
   ! ---------------------
   subroutine apply_BC_0(ncol, nlay, ngpt, top_at_1, flux_dn) bind (C, name="apply_BC_0")
@@ -1162,21 +1158,20 @@ subroutine apply_BC_0(ncol, nlay, ngpt, top_at_1, flux_dn) bind (C, name="apply_
 
     !   Upper boundary condition
     if(top_at_1) then
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol,      1, igpt)  = 0._wp
         end do
       end do
     else
-      !$acc  parallel loop collapse(2) async
+      !$acc  parallel loop collapse(2)
       do igpt = 1, ngpt
         do icol = 1, ncol
           flux_dn(icol, nlay+1, igpt)  = 0._wp
         end do
       end do
     end if
-    !$acc wait
   end subroutine apply_BC_0
 
 end module mo_rte_solver_kernels
diff --git a/rte/kernels-openacc/mo_util_array.F90 b/rte/kernels-openacc/mo_util_array.F90
deleted file mode 100644
index 37efadd37..000000000
--- a/rte/kernels-openacc/mo_util_array.F90
+++ /dev/null
@@ -1,73 +0,0 @@
-! This code is part of Radiative Transfer for Energetics (RTE)
-!
-! Contacts: Robert Pincus and Eli Mlawer
-! email:  rrtmgp@aer.com
-!
-! Copyright 2015-2019,  Atmospheric and Environmental Research and
-! Regents of the University of Colorado.  All right reserved.
-!
-! Use and duplication is permitted under the terms of the
-!    BSD 3-clause license, see http://opensource.org/licenses/BSD-3-Clause
-! -------------------------------------------------------------------------------------------------
-module mo_util_array
-!
-! This module provide utilites for sanitizing input arrays:
-!    checking values and sizes
-! These are in a module so code can be written for both CPUs and GPUs
-! Used only by Fortran classes so routines don't need C bindings and can use assumed-shape
-! Currently only for 3D arrays; could extend through overloading to other ranks
-!
-!
-  use mo_rte_kind,      only: wp
-  implicit none
-  private
-  public :: any_vals_less_than, any_vals_outside
-contains
-!-------------------------------------------------------------------------------------------------
-  logical function any_vals_less_than(array, minVal)
-    real(wp), dimension(:,:,:), intent(in) :: array
-    real(wp),                   intent(in) :: minVal
-
-    real(wp) :: minValue
-    integer  :: i, j, k
-
-    ! This could be written far more compactly as
-    !       any_vals_less_than = any(array < minVal)
-    ! but an explicit loop also works on GPUs
-    minValue = minVal
-    !$acc parallel loop collapse(3) copyin(array) reduction(min:minValue)
-    do k = 1, size(array,3)
-      do j = 1, size(array,2)
-        do i = 1, size(array,1)
-          minValue = min(array(i,j,k), minValue)
-        end do
-      end do
-    end do
-    any_vals_less_than = (minValue < minVal)
-  end function any_vals_less_than
-  ! ---------------------------------
-  logical function any_vals_outside(array, minVal, maxVal)
-    real(wp), dimension(:,:,:), intent(in) :: array
-    real(wp),                   intent(in) :: minVal, maxVal
-
-    real(wp) :: minValue, maxValue
-    integer  :: i, j, k
-
-    ! This could be written far more compactly as
-    !   any_vals_outside = any(array < minVal .or. array > maxVal)
-    ! but an explicit loop also works on GPUs
-    minValue = minVal
-    maxValue = maxVal
-    !$acc parallel loop collapse(3) copyin(array) reduction(min:minValue) reduction(max:maxValue)
-    do k = 1, size(array,3)
-      do j = 1, size(array,2)
-        do i = 1, size(array,1)
-          minValue = min(array(i,j,k), minValue)
-          maxValue = max(array(i,j,k), minValue)
-        end do
-      end do
-    end do
-    any_vals_outside = (minValue < minVal .or. maxValue > maxVal)
-  end function any_vals_outside
-! ---------------------------------
-end module mo_util_array
diff --git a/rte/kernels/mo_fluxes_broadband_kernels.F90 b/rte/kernels/mo_fluxes_broadband_kernels.F90
index b8206e0e8..cfc3bb7a0 100644
--- a/rte/kernels/mo_fluxes_broadband_kernels.F90
+++ b/rte/kernels/mo_fluxes_broadband_kernels.F90
@@ -35,18 +35,23 @@ pure subroutine sum_broadband(ncol, nlev, ngpt, spectral_flux, broadband_flux) b
 
     integer :: icol, ilev, igpt
 
+    !$acc enter data copyin(spectral_flux) create(broadband_flux)
+    !$acc parallel loop collapse(2)
     do ilev = 1, nlev
       do icol = 1, ncol
         broadband_flux(icol, ilev) =  spectral_flux(icol, ilev, 1)
       end do
     end do
+    !$acc parallel loop collapse(3)
     do igpt = 2, ngpt
       do ilev = 1, nlev
         do icol = 1, ncol
+          !$acc atomic update
           broadband_flux(icol, ilev) = broadband_flux(icol, ilev) + spectral_flux(icol, ilev, igpt)
         end do
       end do
     end do
+    !$acc exit data delete(spectral_flux) copyout(broadband_flux)
   end subroutine sum_broadband
   ! ----------------------------------------------------------------------------
   !
@@ -58,33 +63,48 @@ pure subroutine net_broadband_full(ncol, nlev, ngpt, spectral_flux_dn, spectral_
     real(wp), dimension(ncol, nlev, ngpt), intent(in ) :: spectral_flux_dn, spectral_flux_up
     real(wp), dimension(ncol, nlev),       intent(out) :: broadband_flux_net
 
-    integer :: icol, ilev, igpt
+    integer  :: icol, ilev, igpt
+    real(wp) :: diff
 
+    !$acc enter data copyin(spectral_flux_dn, spectral_flux_up) create(broadband_flux_net)
+    !$acc parallel loop collapse(2)
     do ilev = 1, nlev
       do icol = 1, ncol
-        broadband_flux_net(icol, ilev) = spectral_flux_dn(icol, ilev, 1   ) - spectral_flux_up(icol, ilev, 1)
+        diff = spectral_flux_dn(icol, ilev, 1   ) - spectral_flux_up(icol, ilev,     1)
+        broadband_flux_net(icol, ilev) = diff
       end do
     end do
+    !$acc parallel loop collapse(3)
     do igpt = 2, ngpt
       do ilev = 1, nlev
         do icol = 1, ncol
-          broadband_flux_net(icol, ilev) = broadband_flux_net(icol, ilev) + &
-                                         spectral_flux_dn(icol, ilev, igpt) - spectral_flux_up(icol, ilev, igpt)
+          diff = spectral_flux_dn(icol, ilev, igpt) - spectral_flux_up(icol, ilev, igpt)
+          !$acc atomic update
+          broadband_flux_net(icol, ilev) = broadband_flux_net(icol, ilev) + diff
         end do
       end do
     end do
+    !$acc exit data delete(spectral_flux_dn, spectral_flux_up) copyout(broadband_flux_net)
   end subroutine net_broadband_full
   ! ----------------------------------------------------------------------------
   !
   ! Net flux when bradband flux up and down are already available
   !
-  pure subroutine net_broadband_precalc(ncol, nlay, flux_dn, flux_up, broadband_flux_net) &
+  pure subroutine net_broadband_precalc(ncol, nlev, flux_dn, flux_up, broadband_flux_net) &
     bind(C, name="net_broadband_precalc")
-    integer,                         intent(in ) :: ncol, nlay
-    real(wp), dimension(ncol, nlay), intent(in ) :: flux_dn, flux_up
-    real(wp), dimension(ncol, nlay), intent(out) :: broadband_flux_net
+    integer,                         intent(in ) :: ncol, nlev
+    real(wp), dimension(ncol, nlev), intent(in ) :: flux_dn, flux_up
+    real(wp), dimension(ncol, nlev), intent(out) :: broadband_flux_net
 
-    broadband_flux_net(1:ncol,1:nlay) = flux_dn(1:ncol,1:nlay) - flux_up(1:ncol,1:nlay)
+    integer  :: icol, ilev
+    !$acc enter data copyin(flux_dn, flux_up) create(broadband_flux_net)
+    !$acc parallel loop collapse(2)
+    do ilev = 1, nlev
+      do icol = 1, ncol
+         broadband_flux_net(icol,ilev) = flux_dn(icol,ilev) - flux_up(icol,ilev)
+       end do
+    end do
+    !$acc exit data delete(flux_dn, flux_up) copyout(broadband_flux_net)
   end subroutine net_broadband_precalc
   ! ----------------------------------------------------------------------------
 end module mo_fluxes_broadband_kernels
diff --git a/rte/kernels/mo_util_array.F90 b/rte/kernels/mo_util_array.F90
deleted file mode 100644
index e9c374490..000000000
--- a/rte/kernels/mo_util_array.F90
+++ /dev/null
@@ -1,40 +0,0 @@
-! This code is part of Radiative Transfer for Energetics (RTE)
-!
-! Contacts: Robert Pincus and Eli Mlawer
-! email:  rrtmgp@aer.com
-!
-! Copyright 2015-2019,  Atmospheric and Environmental Research and
-! Regents of the University of Colorado.  All right reserved.
-!
-! Use and duplication is permitted under the terms of the
-!    BSD 3-clause license, see http://opensource.org/licenses/BSD-3-Clause
-! -------------------------------------------------------------------------------------------------
-module mo_util_array
-!
-! This module provide utilites for sanitizing input arrays:
-!    checking values and sizes
-! These are in a module so code can be written for both CPUs and GPUs
-! Used only by Fortran classes so routines don't need C bindings and can use assumed-shape
-! Currently only for 3D arrays; could extend through overloading to other ranks
-!
-  use mo_rte_kind,      only: wp
-  implicit none
-  private
-  public :: any_vals_less_than, any_vals_outside
-contains
-!-------------------------------------------------------------------------------------------------
-  logical function any_vals_less_than(array, minVal)
-    real(wp), dimension(:,:,:), intent(in) :: array
-    real(wp),                   intent(in) :: minVal
-
-    any_vals_less_than = any(array < minVal)
-  end function any_vals_less_than
-  ! ---------------------------------
-  logical function any_vals_outside(array, minVal, maxVal)
-    real(wp), dimension(:,:,:), intent(in) :: array
-    real(wp),                   intent(in) :: minVal, maxVal
-
-    any_vals_outside = any(array < minVal .or. array > maxVal)
-  end function any_vals_outside
-! ---------------------------------
-end module mo_util_array
diff --git a/rte/mo_rte_lw.F90 b/rte/mo_rte_lw.F90
index dc0b24d93..9439f0b73 100644
--- a/rte/mo_rte_lw.F90
+++ b/rte/mo_rte_lw.F90
@@ -35,6 +35,7 @@
 ! -------------------------------------------------------------------------------------------------
 module mo_rte_lw
   use mo_rte_kind,      only: wp, wl
+  use mo_util_array,    only: any_vals_less_than, any_vals_outside
   use mo_optical_props, only: ty_optical_props, &
                               ty_optical_props_arry, ty_optical_props_1scl, ty_optical_props_2str, ty_optical_props_nstr
   use mo_source_functions,   &
@@ -132,7 +133,7 @@ function rte_lw(optical_props, top_at_1, &
     !
     if(any([size(sfc_emis,1), size(sfc_emis,2)] /= [nband, ncol])) &
       error_msg = "rte_lw: sfc_emis inconsistently sized"
-    if(any(sfc_emis < 0._wp .or. sfc_emis > 1._wp)) &
+    if(any_vals_outside(sfc_emis, 0._wp, 1._wp)) &
       error_msg = "rte_lw: sfc_emis has values < 0 or > 1"
     if(len_trim(error_msg) > 0) return
 
@@ -142,7 +143,7 @@ function rte_lw(optical_props, top_at_1, &
     if(present(inc_flux)) then
       if(any([size(inc_flux,1), size(inc_flux,2)] /= [ncol, ngpt])) &
         error_msg = "rte_lw: inc_flux inconsistently sized"
-      if(any(inc_flux < 0._wp)) &
+      if(any_vals_less_than(inc_flux, 0._wp)) &
         error_msg = "rte_lw: inc_flux has values < 0"
     end if
     if(len_trim(error_msg) > 0) return
@@ -174,8 +175,8 @@ function rte_lw(optical_props, top_at_1, &
     !
     allocate(gpt_flux_up (ncol, nlay+1, ngpt), gpt_flux_dn(ncol, nlay+1, ngpt))
     allocate(sfc_emis_gpt(ncol,         ngpt))
-    !$acc enter data copyin(sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source)
-    !$acc enter data copyin(gauss_Ds, gauss_wts)
+    !!$acc enter data copyin(sources, sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source)
+    !$acc enter data copyin(optical_props)
     !$acc enter data create(gpt_flux_dn, gpt_flux_up)
     !$acc enter data create(sfc_emis_gpt)
     call expand_and_transpose(optical_props, sfc_emis, sfc_emis_gpt)
@@ -202,6 +203,8 @@ function rte_lw(optical_props, top_at_1, &
         ! No scattering two-stream calculation
         !
         !$acc enter data copyin(optical_props%tau)
+        error_msg =  optical_props%validate()
+        if(len_trim(error_msg) > 0) return
         call lw_solver_noscat_GaussQuad(ncol, nlay, ngpt, logical(top_at_1, wl), &
                               n_quad_angs, gauss_Ds(1:n_quad_angs,n_quad_angs), gauss_wts(1:n_quad_angs,n_quad_angs), &
                               optical_props%tau,                                                  &
@@ -214,6 +217,8 @@ function rte_lw(optical_props, top_at_1, &
         ! two-stream calculation with scattering
         !
         !$acc enter data copyin(optical_props%tau, optical_props%ssa, optical_props%g)
+        error_msg =  optical_props%validate()
+        if(len_trim(error_msg) > 0) return
         call lw_solver_2stream(ncol, nlay, ngpt, logical(top_at_1, wl), &
                                optical_props%tau, optical_props%ssa, optical_props%g,              &
                                sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, &
@@ -232,9 +237,10 @@ function rte_lw(optical_props, top_at_1, &
     ! ...and reduce spectral fluxes to desired output quantities
     !
     error_msg = fluxes%reduce(gpt_flux_up, gpt_flux_dn, optical_props, top_at_1)
-    !$acc exit data delete(sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source)
-    !$acc exit data delete(sfc_emis_gpt, gauss_Ds, gauss_wts)
+    !$acc exit data delete(sfc_emis_gpt)
     !$acc exit data delete(gpt_flux_up,gpt_flux_dn)
+    !$acc exit data delete(optical_props)
+    !!$acc exit data delete(sources%lay_source, sources%lev_source_inc, sources%lev_source_dec, sources%sfc_source,sources)
   end function rte_lw
   !--------------------------------------------------------------------------------------------------------------------
   !
diff --git a/rte/mo_rte_sw.F90 b/rte/mo_rte_sw.F90
index 0a5aec0bd..bb69a37d9 100644
--- a/rte/mo_rte_sw.F90
+++ b/rte/mo_rte_sw.F90
@@ -29,6 +29,7 @@
 ! -------------------------------------------------------------------------------------------------
 module mo_rte_sw
   use mo_rte_kind,      only: wp, wl
+  use mo_util_array,    only: any_vals_less_than, any_vals_outside
   use mo_optical_props, only: ty_optical_props, &
                               ty_optical_props_arry, ty_optical_props_1scl, ty_optical_props_2str, ty_optical_props_nstr
   use mo_fluxes,        only: ty_fluxes
@@ -87,35 +88,29 @@ function rte_sw(atmos, top_at_1,                 &
     !
     if(     size(mu0, 1)                                /=  ncol        ) &
       error_msg = "rte_sw: mu0 inconsistently sized"
-    if(any(mu0 <= 0._wp .or. mu0 > 1)) &
+    if(any_vals_outside(mu0, 0._wp, 1._wp)) &
       error_msg = "rte_sw: one or more mu0 <= 0 or > 1"
 
     if(any([size(inc_flux, 1),    size(inc_flux, 2)]    /= [ncol, ngpt])) &
       error_msg = "rte_sw: inc_flux inconsistently sized"
-    if(any(inc_flux <  0._wp)) &
+    if(any_vals_less_than(inc_flux, 0._wp)) &
       error_msg = "rte_sw: one or more inc_flux < 0"
     if(present(inc_flux_dif)) then
       if(any([size(inc_flux_dif, 1),    size(inc_flux_dif, 2)]    /= [ncol, ngpt])) &
         error_msg = "rte_sw: inc_flux_dif inconsistently sized"
-      if(any(inc_flux_dif <  0._wp)) &
+      if(any_vals_less_than(inc_flux_dif, 0._wp)) &
         error_msg = "rte_sw: one or more inc_flux_dif < 0"
     end if
 
     if(any([size(sfc_alb_dir, 1), size(sfc_alb_dir, 2)] /= [nband, ncol])) &
       error_msg = "rte_sw: sfc_alb_dir inconsistently sized"
-    if(any(sfc_alb_dir < 0._wp .or. sfc_alb_dir > 1._wp)) &
+    if(any_vals_outside(sfc_alb_dir,  0._wp, 1._wp)) &
       error_msg = "rte_sw: sfc_alb_dir out of bounds [0,1]"
     if(any([size(sfc_alb_dif, 1), size(sfc_alb_dif, 2)] /= [nband, ncol])) &
       error_msg = "rte_sw: sfc_alb_dif inconsistently sized"
-    if(any(sfc_alb_dif < 0._wp .or. sfc_alb_dif > 1._wp)) &
+    if(any_vals_outside(sfc_alb_dif,  0._wp, 1._wp)) &
       error_msg = "rte_sw: sfc_alb_dif out of bounds [0,1]"
 
-    if(len_trim(error_msg) > 0) return
-
-    !
-    ! Ensure values of tau, ssa, and g are reasonable
-    !
-    error_msg =  atmos%validate()
     if(len_trim(error_msg) > 0) then
       if(len_trim(atmos%get_name()) > 0) &
         error_msg = trim(atmos%get_name()) // ': ' // trim(error_msg)
@@ -161,6 +156,8 @@ function rte_sw(atmos, top_at_1,                 &
         ! Direct beam only
         !
         !$acc enter data copyin(atmos, atmos%tau)
+        error_msg =  atmos%validate()
+        if(len_trim(error_msg) > 0) return
         call sw_solver_noscat(ncol, nlay, ngpt, logical(top_at_1, wl), &
                               atmos%tau, mu0,                          &
                               gpt_flux_dir)
@@ -175,6 +172,8 @@ function rte_sw(atmos, top_at_1,                 &
         ! two-stream calculation with scattering
         !
         !$acc enter data copyin(atmos, atmos%tau, atmos%ssa, atmos%g)
+        error_msg =  atmos%validate()
+        if(len_trim(error_msg) > 0) return
         call sw_solver_2stream(ncol, nlay, ngpt, logical(top_at_1, wl), &
                                atmos%tau, atmos%ssa, atmos%g, mu0,      &
                                sfc_alb_dir_gpt, sfc_alb_dif_gpt,        &
@@ -189,7 +188,11 @@ function rte_sw(atmos, top_at_1,                 &
         !
         error_msg = 'sw_solver(...ty_optical_props_nstr...) not yet implemented'
     end select
-    if (error_msg /= '') return
+    if(len_trim(error_msg) > 0) then
+      if(len_trim(atmos%get_name()) > 0) &
+        error_msg = trim(atmos%get_name()) // ': ' // trim(error_msg)
+      return
+    end if
     !
     ! ...and reduce spectral fluxes to desired output quantities
     !
diff --git a/rte/mo_util_array.F90 b/rte/mo_util_array.F90
index 37efadd37..28e8f254d 100644
--- a/rte/mo_util_array.F90
+++ b/rte/mo_util_array.F90
@@ -15,47 +15,151 @@ module mo_util_array
 !    checking values and sizes
 ! These are in a module so code can be written for both CPUs and GPUs
 ! Used only by Fortran classes so routines don't need C bindings and can use assumed-shape
-! Currently only for 3D arrays; could extend through overloading to other ranks
-!
 !
   use mo_rte_kind,      only: wp
   implicit none
+  interface any_vals_less_than
+    module procedure any_vals_less_than_1D, any_vals_less_than_2D, any_vals_less_than_3D
+  end interface
+  interface any_vals_outside
+    module procedure any_vals_outside_1D, any_vals_outside_2D, any_vals_outside_3D
+  end interface
+  interface zero_array
+    module procedure zero_array_1D, zero_array_3D, zero_array_4D
+  end interface
+
   private
-  public :: any_vals_less_than, any_vals_outside
+  public :: zero_array, any_vals_less_than, any_vals_outside
 contains
+  !-------------------------------------------------------------------------------------------------
+  ! Values less than a floor
+  !-------------------------------------------------------------------------------------------------
+  logical function any_vals_less_than_1D(array, minVal)
+    real(wp), dimension(:), intent(in) :: array
+    real(wp),               intent(in) :: minVal
+
+#ifdef _OPENACC
+    ! Compact version using intrinsics below
+    ! but an explicit loop is the only current solution on GPUs
+    real(wp) :: minValue
+    integer  :: i
+
+    minValue = minVal
+    !$acc parallel loop copyin(array) reduction(min:minValue)
+    do i = 1, size(array,1)
+      minValue = min(array(i), minValue)
+    end do
+    any_vals_less_than_1D = (minValue < minVal)
+#else
+    any_vals_less_than_1D = any(array < minVal)
+#endif
+end function any_vals_less_than_1D
+  !-------------------------------------------------------------------------------------------------
+  logical function any_vals_less_than_2D(array, minVal)
+    real(wp), dimension(:,:), intent(in) :: array
+    real(wp),                 intent(in) :: minVal
+
+#ifdef _OPENACC
+    ! Compact version using intrinsics below
+    ! but an explicit loop is the only current solution on GPUs
+    real(wp) :: minValue
+    integer  :: i, j
+
+    minValue = minVal
+    !$acc parallel loop collapse(2) copyin(array) reduction(min:minValue)
+    do j = 1, size(array,2)
+      do i = 1, size(array,1)
+        minValue = min(array(i,j), minValue)
+      end do
+    end do
+    any_vals_less_than_2D = (minValue < minVal)
+#else
+    any_vals_less_than_2D = any(array < minVal)
+#endif
+  end function any_vals_less_than_2D
 !-------------------------------------------------------------------------------------------------
-  logical function any_vals_less_than(array, minVal)
+  logical function any_vals_less_than_3D(array, minVal)
     real(wp), dimension(:,:,:), intent(in) :: array
     real(wp),                   intent(in) :: minVal
 
-    real(wp) :: minValue
-    integer  :: i, j, k
+#ifdef _OPENACC
+  ! Compact version using intrinsics below
+  ! but an explicit loop is the only current solution on GPUs
+  real(wp) :: minValue
+  integer  :: i, j, k
 
-    ! This could be written far more compactly as
-    !       any_vals_less_than = any(array < minVal)
-    ! but an explicit loop also works on GPUs
+  minValue = minVal
+  !$acc parallel loop collapse(3) copyin(array) reduction(min:minValue)
+  do k = 1, size(array,3)
+    do j = 1, size(array,2)
+      do i = 1, size(array,1)
+        minValue = min(array(i,j,k), minValue)
+      end do
+    end do
+  end do
+  any_vals_less_than_3D = (minValue < minVal)
+#else
+  any_vals_less_than_3D = any(array < minVal)
+#endif
+  end function any_vals_less_than_3D
+  !-------------------------------------------------------------------------------------------------
+  ! Values outside a range
+  !-------------------------------------------------------------------------------------------------
+  logical function any_vals_outside_1D(array, minVal, maxVal)
+    real(wp), dimension(:), intent(in) :: array
+    real(wp),               intent(in) :: minVal, maxVal
+
+#ifdef _OPENACC
+      ! Compact version using intrinsics below
+      ! but an explicit loop is the only current solution on GPUs
+    real(wp) :: minValue, maxValue
+    integer  :: i
     minValue = minVal
-    !$acc parallel loop collapse(3) copyin(array) reduction(min:minValue)
-    do k = 1, size(array,3)
-      do j = 1, size(array,2)
-        do i = 1, size(array,1)
-          minValue = min(array(i,j,k), minValue)
-        end do
+    maxValue = maxVal
+    !$acc parallel loop copyin(array) reduction(min:minValue) reduction(max:maxValue)
+    do i = 1, size(array,1)
+      minValue = min(array(i), minValue)
+      maxValue = max(array(i), maxValue)
+    end do
+    any_vals_outside_1D = (minValue < minVal .or. maxValue > maxVal)
+#else
+    any_vals_outside_1D = any(array < minVal .or. array > maxVal)
+#endif
+  end function any_vals_outside_1D
+! ----------------------------------------------------------
+  logical function any_vals_outside_2D(array, minVal, maxVal)
+    real(wp), dimension(:,:), intent(in) :: array
+    real(wp),                 intent(in) :: minVal, maxVal
+
+#ifdef _OPENACC
+      ! Compact version using intrinsics below
+      ! but an explicit loop is the only current solution on GPUs
+    real(wp) :: minValue, maxValue
+    integer  :: i, j
+    minValue = minVal
+    maxValue = maxVal
+    !$acc parallel loop collapse(2) copyin(array) reduction(min:minValue) reduction(max:maxValue)
+    do j = 1, size(array,2)
+      do i = 1, size(array,1)
+        minValue = min(array(i,j), minValue)
+        maxValue = max(array(i,j), maxValue)
       end do
     end do
-    any_vals_less_than = (minValue < minVal)
-  end function any_vals_less_than
-  ! ---------------------------------
-  logical function any_vals_outside(array, minVal, maxVal)
+    any_vals_outside_2D = (minValue < minVal .or. maxValue > maxVal)
+#else
+    any_vals_outside_2D = any(array < minVal .or. array > maxVal)
+#endif
+  end function any_vals_outside_2D
+! ----------------------------------------------------------
+  logical function any_vals_outside_3D(array, minVal, maxVal)
     real(wp), dimension(:,:,:), intent(in) :: array
     real(wp),                   intent(in) :: minVal, maxVal
 
+#ifdef _OPENACC
+      ! Compact version using intrinsics below
+      ! but an explicit loop is the only current solution on GPUs
     real(wp) :: minValue, maxValue
     integer  :: i, j, k
-
-    ! This could be written far more compactly as
-    !   any_vals_outside = any(array < minVal .or. array > maxVal)
-    ! but an explicit loop also works on GPUs
     minValue = minVal
     maxValue = maxVal
     !$acc parallel loop collapse(3) copyin(array) reduction(min:minValue) reduction(max:maxValue)
@@ -63,11 +167,64 @@ logical function any_vals_outside(array, minVal, maxVal)
       do j = 1, size(array,2)
         do i = 1, size(array,1)
           minValue = min(array(i,j,k), minValue)
-          maxValue = max(array(i,j,k), minValue)
+          maxValue = max(array(i,j,k), maxValue)
+        end do
+      end do
+    end do
+    any_vals_outside_3D = (minValue < minVal .or. maxValue > maxVal)
+#else
+    any_vals_outside_3D = any(array < minVal .or. array > maxVal)
+#endif
+  end function any_vals_outside_3D
+  !-------------------------------------------------------------------------------------------------
+  ! Initializing arrays to 0
+  !-------------------------------------------------------------------------------------------------
+  subroutine zero_array_1D(ni, array) bind(C, name="zero_array_1D")
+    integer,                 intent(in ) :: ni
+    real(wp), dimension(ni), intent(out) :: array
+    ! -----------------------
+    integer :: i
+    ! -----------------------
+    !$acc parallel loop copyout(array)
+    do i = 1, ni
+      array(i) = 0.0_wp
+    end do
+  end subroutine zero_array_1D
+  ! ----------------------------------------------------------
+  subroutine zero_array_3D(ni, nj, nk, array) bind(C, name="zero_array_3D")
+    integer,                         intent(in ) :: ni, nj, nk
+    real(wp), dimension(ni, nj, nk), intent(out) :: array
+    ! -----------------------
+    integer :: i,j,k
+    ! -----------------------
+    !$acc parallel loop collapse(3) copyout(array)
+    do k = 1, nk
+      do j = 1, nj
+        do i = 1, ni
+          array(i,j,k) = 0.0_wp
         end do
       end do
     end do
-    any_vals_outside = (minValue < minVal .or. maxValue > maxVal)
-  end function any_vals_outside
-! ---------------------------------
+
+  end subroutine zero_array_3D
+  ! ----------------------------------------------------------
+  subroutine zero_array_4D(ni, nj, nk, nl, array) bind(C, name="zero_array_4D")
+    integer,                             intent(in ) :: ni, nj, nk, nl
+    real(wp), dimension(ni, nj, nk, nl), intent(out) :: array
+    ! -----------------------
+    integer :: i,j,k,l
+    ! -----------------------
+    !$acc parallel loop collapse(4) copyout(array)
+    do l = 1, nl
+      do k = 1, nk
+        do j = 1, nj
+          do i = 1, ni
+            array(i,j,k,l) = 0.0_wp
+          end do
+        end do
+      end do
+    end do
+
+  end subroutine zero_array_4D
+! ----------------------------------------------------------
 end module mo_util_array