diff --git a/include_test/Netcdf_interface.h b/include_test/Netcdf_interface.h
index f64f304..d5c52dd 100644
--- a/include_test/Netcdf_interface.h
+++ b/include_test/Netcdf_interface.h
@@ -264,7 +264,7 @@ inline Netcdf_file::Netcdf_file(const std::string& name, Netcdf_mode mode) :
     int nc_check_code = 0;
 
     if (mode == Netcdf_mode::Create)
-        nc_check_code = nc_create(name.c_str(), NC_NOCLOBBER | NC_NETCDF4, &ncid);
+        nc_check_code = nc_create(name.c_str(), NC_CLOBBER | NC_NETCDF4, &ncid);
     else if (mode == Netcdf_mode::Write)
         nc_check_code = nc_open(name.c_str(), NC_WRITE | NC_NETCDF4, &ncid);
     else if (mode == Netcdf_mode::Read)
diff --git a/src_cuda/Rte_lw.cu b/src_cuda/Rte_lw.cu
index 61deea3..a913637 100644
--- a/src_cuda/Rte_lw.cu
+++ b/src_cuda/Rte_lw.cu
@@ -56,27 +56,6 @@ namespace
     }
 }
 
-//    template<typename Float>
-//    void apply_BC(
-//            int ncol, int nlay, int ngpt,
-//            Bool top_at_1, Array<Float,3>& gpt_flux_dn)
-//    {
-//        rrtmgp_kernels::apply_BC_0(
-//                &ncol, &nlay, &ngpt,
-//                &top_at_1, gpt_flux_dn.ptr());
-//    }
-//
-//    template<typename Float>
-//    void apply_BC(
-//            int ncol, int nlay, int ngpt,
-//            Bool top_at_1, const Array<Float,2>& inc_flux,
-//            Array<Float,3>& gpt_flux_dn)
-//    {
-//        rrtmgp_kernels::apply_BC_gpt(
-//                &ncol, &nlay, &ngpt,
-//                &top_at_1, const_cast<Float*>(inc_flux.ptr()), gpt_flux_dn.ptr());
-//    }
-
 
 void Rte_lw_gpu::rte_lw(
         const std::unique_ptr<Optical_props_arry_gpu>& optical_props,
@@ -134,7 +113,7 @@ void Rte_lw_gpu::rte_lw(
 
     // pass null ptr if size of inc_flux is zero
     const Float* inc_flux_ptr = (inc_flux.size() == 0) ? nullptr : inc_flux.ptr();
-    
+
     Rte_solver_kernels_cuda::lw_solver_noscat(
             ncol, nlay, ngpt, top_at_1, n_quad_angs,
             secants.ptr(), gauss_wts_subset.ptr(),
diff --git a/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu b/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu
index df35b86..46cdef1 100644
--- a/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu
+++ b/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu
@@ -229,6 +229,7 @@ void Planck_source_kernel(
     // THIS KERNEL USES FORTRAN INDEXING TO AVOID MISTAKES.
     const int icol = blockIdx.x*blockDim.x + threadIdx.x + 1;
     const int ilay = blockIdx.y*blockDim.y + threadIdx.y + 1;
+    const int igpt = blockIdx.z*blockDim.z + threadIdx.z + 1;
 
     // Input arrays, use Index functor to simplify index porting from Fortran to CUDA
     const Index_2d<const Float> tlay        (tlay_ptr, ncol, nlay);
@@ -251,66 +252,63 @@ void Planck_source_kernel(
     Index_3d<Float> lev_src(lev_src_ptr, ncol, nlay+1, ngpt);
     Index_2d<Float> sfc_src_jac(sfc_src_jac_ptr, ncol, ngpt);
 
-    if ( (icol <= ncol) && (ilay <= nlay) )
+    if ( (icol <= ncol) && (ilay <= nlay) && (igpt <= ngpt) )
     {
-        for (int igpt=1; igpt<=ngpt; ++igpt)
+        const int ibnd = gpoint_bands(igpt);
+        const int itropo = (tropo(icol, ilay) == Bool(true)) ? 1 : 2;
+        const int iflav = gpoint_flavor(itropo, igpt);
+
+        // 3D interp.
+        const Float pfrac =
+              ( fmajor(1, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)  , jpress(icol, ilay)-1 + itropo, igpt)
+              + fmajor(2, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt)
+              + fmajor(1, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)  , jpress(icol, ilay)   + itropo, igpt)
+              + fmajor(2, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)   + itropo, igpt) )
+
+            + ( fmajor(1, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)  , jpress(icol, ilay)-1 + itropo, igpt)
+              + fmajor(2, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt)
+              + fmajor(1, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)  , jpress(icol, ilay)   + itropo, igpt)
+              + fmajor(2, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)   + itropo, igpt) );
+
+        Float planck_function_1 = interpolate1D(tlay(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
+        lay_src(icol, ilay, igpt) = pfrac * planck_function_1;
+
+        planck_function_1 = interpolate1D(tlev(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
+        if (ilay == 1)
         {
-            const int ibnd = gpoint_bands(igpt);
-            const int itropo = (tropo(icol, ilay) == Bool(true)) ? 1 : 2;
+            lev_src(icol, ilay, igpt) = pfrac * planck_function_1;
+        }
+        else
+        {
+            const int itropo = (tropo(icol, ilay-1) == Bool(true)) ? 1 : 2;
             const int iflav = gpoint_flavor(itropo, igpt);
 
-            // 3D interp.
-            const Float pfrac =
-                  ( fmajor(1, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)  , jpress(icol, ilay)-1 + itropo, igpt)
-                  + fmajor(2, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt)
-                  + fmajor(1, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)  , jpress(icol, ilay)   + itropo, igpt)
-                  + fmajor(2, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)   + itropo, igpt) )
-
-                + ( fmajor(1, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)  , jpress(icol, ilay)-1 + itropo, igpt)
-                  + fmajor(2, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt)
-                  + fmajor(1, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)  , jpress(icol, ilay)   + itropo, igpt)
-                  + fmajor(2, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)   + itropo, igpt) );
-
-            Float planck_function_1 = interpolate1D(tlay(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
-            lay_src(icol, ilay, igpt) = pfrac * planck_function_1;
-
-            planck_function_1 = interpolate1D(tlev(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
-            if (ilay == 1)
-            {
-                lev_src(icol, ilay, igpt) = pfrac * planck_function_1;
-            }
-            else
-            {
-                const int itropo = (tropo(icol, ilay-1) == Bool(true)) ? 1 : 2;
-                const int iflav = gpoint_flavor(itropo, igpt);
-
-                const Float pfrac_m1 =
-                      ( fmajor(1, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)  , jpress(icol, ilay-1)-1 + itropo, igpt)
-                      + fmajor(2, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt)
-                      + fmajor(1, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)  , jpress(icol, ilay-1)   + itropo, igpt)
-                      + fmajor(2, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)   + itropo, igpt) )
+            const Float pfrac_m1 =
+                  ( fmajor(1, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)  , jpress(icol, ilay-1)-1 + itropo, igpt)
+                  + fmajor(2, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt)
+                  + fmajor(1, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)  , jpress(icol, ilay-1)   + itropo, igpt)
+                  + fmajor(2, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)   + itropo, igpt) )
 
-                    + ( fmajor(1, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)  , jpress(icol, ilay-1)-1 + itropo, igpt)
-                      + fmajor(2, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt)
-                      + fmajor(1, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)  , jpress(icol, ilay-1)   + itropo, igpt)
-                      + fmajor(2, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)   + itropo, igpt) );
+                + ( fmajor(1, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)  , jpress(icol, ilay-1)-1 + itropo, igpt)
+                  + fmajor(2, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt)
+                  + fmajor(1, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)  , jpress(icol, ilay-1)   + itropo, igpt)
+                  + fmajor(2, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)   + itropo, igpt) );
 
-                lev_src(icol, ilay, igpt) = sqrt(pfrac * pfrac_m1) * planck_function_1;
-            }
+            lev_src(icol, ilay, igpt) = sqrt(pfrac * pfrac_m1) * planck_function_1;
+        }
 
-            if (ilay == nlay)
-            {
-                planck_function_1 = interpolate1D(tlev(icol, ilay+1), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
-                lev_src(icol, ilay+1, igpt) = pfrac * planck_function_1;
-            }
+        if (ilay == nlay)
+        {
+            planck_function_1 = interpolate1D(tlev(icol, ilay+1), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
+            lev_src(icol, ilay+1, igpt) = pfrac * planck_function_1;
+        }
 
-            if (ilay == sfc_lay)
-            {
-                            planck_function_1 = interpolate1D(tsfc(icol)              , temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
-                const Float planck_function_2 = interpolate1D(tsfc(icol) + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
-                sfc_src    (icol, igpt) = pfrac * planck_function_1;
-                sfc_src_jac(icol, igpt) = pfrac * (planck_function_2 - planck_function_1);
-            }
+        if (ilay == sfc_lay)
+        {
+                        planck_function_1 = interpolate1D(tsfc(icol)              , temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
+            const Float planck_function_2 = interpolate1D(tsfc(icol) + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd));
+            sfc_src    (icol, igpt) = pfrac * planck_function_1;
+            sfc_src_jac(icol, igpt) = pfrac * (planck_function_2 - planck_function_1);
         }
     }
 }
diff --git a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
index e17440b..30f9ddd 100644
--- a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
+++ b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
@@ -473,23 +473,17 @@ namespace Gas_optics_rrtmgp_kernels_cuda
 
         const Float delta_Tsurf = Float(1.);
 
-        const int block_col = 32;
-        const int block_lay = 4;
-
-        const int grid_col = ncol/block_col + (ncol%block_col > 0);
-        const int grid_lay = nlay/block_lay + (nlay%block_lay > 0);
-
-        dim3 grid_gpu(grid_col, grid_lay);
-        dim3 block_gpu(block_col, block_lay);
+        dim3 grid_gpu;
+        dim3 block_gpu;
         
         if (tunings.count("Planck_source_kernel") == 0)
         {
             std::tie(grid_gpu, block_gpu) = tune_kernel(
                     "Planck_source_kernel",
-                    dim3(ncol, nlay),
-                    {1, 2, 4, 8, 16, 32, 48, 64, 96, 128, 256, 512},
-                    {1, 2, 4, 8, 16, 32, 48, 64, 96, 128, 256, 512},
+                    dim3(ncol, nlay, ngpt),
+                    {4, 8, 16, 32, 48, 64, 96, 128},
                     {1},
+                    {4, 8, 16, 32, 48, 64, 96, 128},
                     Planck_source_kernel,
                     ncol, nlay, nbnd, ngpt,
                     nflav, neta, npres, ntemp, nPlanckTemp,
@@ -510,7 +504,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda
             block_gpu = tunings["Planck_source_kernel"].second;
         }
 
-        grid_gpu = calc_grid_size(block_gpu, dim3(ncol, nlay));
+        grid_gpu = calc_grid_size(block_gpu, dim3(ncol, nlay, ngpt));
 
         Planck_source_kernel<<<grid_gpu, block_gpu>>>(
                 ncol, nlay, nbnd, ngpt,