diff --git a/include_test/Netcdf_interface.h b/include_test/Netcdf_interface.h index f64f304..d5c52dd 100644 --- a/include_test/Netcdf_interface.h +++ b/include_test/Netcdf_interface.h @@ -264,7 +264,7 @@ inline Netcdf_file::Netcdf_file(const std::string& name, Netcdf_mode mode) : int nc_check_code = 0; if (mode == Netcdf_mode::Create) - nc_check_code = nc_create(name.c_str(), NC_NOCLOBBER | NC_NETCDF4, &ncid); + nc_check_code = nc_create(name.c_str(), NC_CLOBBER | NC_NETCDF4, &ncid); else if (mode == Netcdf_mode::Write) nc_check_code = nc_open(name.c_str(), NC_WRITE | NC_NETCDF4, &ncid); else if (mode == Netcdf_mode::Read) diff --git a/src_cuda/Rte_lw.cu b/src_cuda/Rte_lw.cu index 61deea3..a913637 100644 --- a/src_cuda/Rte_lw.cu +++ b/src_cuda/Rte_lw.cu @@ -56,27 +56,6 @@ namespace } } -// template -// void apply_BC( -// int ncol, int nlay, int ngpt, -// Bool top_at_1, Array& gpt_flux_dn) -// { -// rrtmgp_kernels::apply_BC_0( -// &ncol, &nlay, &ngpt, -// &top_at_1, gpt_flux_dn.ptr()); -// } -// -// template -// void apply_BC( -// int ncol, int nlay, int ngpt, -// Bool top_at_1, const Array& inc_flux, -// Array& gpt_flux_dn) -// { -// rrtmgp_kernels::apply_BC_gpt( -// &ncol, &nlay, &ngpt, -// &top_at_1, const_cast(inc_flux.ptr()), gpt_flux_dn.ptr()); -// } - void Rte_lw_gpu::rte_lw( const std::unique_ptr& optical_props, @@ -134,7 +113,7 @@ void Rte_lw_gpu::rte_lw( // pass null ptr if size of inc_flux is zero const Float* inc_flux_ptr = (inc_flux.size() == 0) ? nullptr : inc_flux.ptr(); - + Rte_solver_kernels_cuda::lw_solver_noscat( ncol, nlay, ngpt, top_at_1, n_quad_angs, secants.ptr(), gauss_wts_subset.ptr(), diff --git a/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu b/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu index df35b86..46cdef1 100644 --- a/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu +++ b/src_kernels_cuda/gas_optics_rrtmgp_kernels.cu @@ -229,6 +229,7 @@ void Planck_source_kernel( // THIS KERNEL USES FORTRAN INDEXING TO AVOID MISTAKES. const int icol = blockIdx.x*blockDim.x + threadIdx.x + 1; const int ilay = blockIdx.y*blockDim.y + threadIdx.y + 1; + const int igpt = blockIdx.z*blockDim.z + threadIdx.z + 1; // Input arrays, use Index functor to simplify index porting from Fortran to CUDA const Index_2d tlay (tlay_ptr, ncol, nlay); @@ -251,66 +252,63 @@ void Planck_source_kernel( Index_3d lev_src(lev_src_ptr, ncol, nlay+1, ngpt); Index_2d sfc_src_jac(sfc_src_jac_ptr, ncol, ngpt); - if ( (icol <= ncol) && (ilay <= nlay) ) + if ( (icol <= ncol) && (ilay <= nlay) && (igpt <= ngpt) ) { - for (int igpt=1; igpt<=ngpt; ++igpt) + const int ibnd = gpoint_bands(igpt); + const int itropo = (tropo(icol, ilay) == Bool(true)) ? 1 : 2; + const int iflav = gpoint_flavor(itropo, igpt); + + // 3D interp. + const Float pfrac = + ( fmajor(1, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav) , jpress(icol, ilay)-1 + itropo, igpt) + + fmajor(2, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt) + + fmajor(1, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav) , jpress(icol, ilay) + itropo, igpt) + + fmajor(2, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay) + itropo, igpt) ) + + + ( fmajor(1, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav) , jpress(icol, ilay)-1 + itropo, igpt) + + fmajor(2, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt) + + fmajor(1, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav) , jpress(icol, ilay) + itropo, igpt) + + fmajor(2, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay) + itropo, igpt) ); + + Float planck_function_1 = interpolate1D(tlay(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); + lay_src(icol, ilay, igpt) = pfrac * planck_function_1; + + planck_function_1 = interpolate1D(tlev(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); + if (ilay == 1) { - const int ibnd = gpoint_bands(igpt); - const int itropo = (tropo(icol, ilay) == Bool(true)) ? 1 : 2; + lev_src(icol, ilay, igpt) = pfrac * planck_function_1; + } + else + { + const int itropo = (tropo(icol, ilay-1) == Bool(true)) ? 1 : 2; const int iflav = gpoint_flavor(itropo, igpt); - // 3D interp. - const Float pfrac = - ( fmajor(1, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav) , jpress(icol, ilay)-1 + itropo, igpt) - + fmajor(2, 1, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt) - + fmajor(1, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav) , jpress(icol, ilay) + itropo, igpt) - + fmajor(2, 2, 1, icol, ilay, iflav) * pfracin(jtemp(icol, ilay), jeta(1, icol, ilay, iflav)+1, jpress(icol, ilay) + itropo, igpt) ) - - + ( fmajor(1, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav) , jpress(icol, ilay)-1 + itropo, igpt) - + fmajor(2, 1, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay)-1 + itropo, igpt) - + fmajor(1, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav) , jpress(icol, ilay) + itropo, igpt) - + fmajor(2, 2, 2, icol, ilay, iflav) * pfracin(jtemp(icol, ilay)+1, jeta(2, icol, ilay, iflav)+1, jpress(icol, ilay) + itropo, igpt) ); - - Float planck_function_1 = interpolate1D(tlay(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); - lay_src(icol, ilay, igpt) = pfrac * planck_function_1; - - planck_function_1 = interpolate1D(tlev(icol, ilay), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); - if (ilay == 1) - { - lev_src(icol, ilay, igpt) = pfrac * planck_function_1; - } - else - { - const int itropo = (tropo(icol, ilay-1) == Bool(true)) ? 1 : 2; - const int iflav = gpoint_flavor(itropo, igpt); - - const Float pfrac_m1 = - ( fmajor(1, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav) , jpress(icol, ilay-1)-1 + itropo, igpt) - + fmajor(2, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt) - + fmajor(1, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav) , jpress(icol, ilay-1) + itropo, igpt) - + fmajor(2, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1) + itropo, igpt) ) + const Float pfrac_m1 = + ( fmajor(1, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav) , jpress(icol, ilay-1)-1 + itropo, igpt) + + fmajor(2, 1, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt) + + fmajor(1, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav) , jpress(icol, ilay-1) + itropo, igpt) + + fmajor(2, 2, 1, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1), jeta(1, icol, ilay-1, iflav)+1, jpress(icol, ilay-1) + itropo, igpt) ) - + ( fmajor(1, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav) , jpress(icol, ilay-1)-1 + itropo, igpt) - + fmajor(2, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt) - + fmajor(1, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav) , jpress(icol, ilay-1) + itropo, igpt) - + fmajor(2, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1) + itropo, igpt) ); + + ( fmajor(1, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav) , jpress(icol, ilay-1)-1 + itropo, igpt) + + fmajor(2, 1, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1)-1 + itropo, igpt) + + fmajor(1, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav) , jpress(icol, ilay-1) + itropo, igpt) + + fmajor(2, 2, 2, icol, ilay-1, iflav) * pfracin(jtemp(icol, ilay-1)+1, jeta(2, icol, ilay-1, iflav)+1, jpress(icol, ilay-1) + itropo, igpt) ); - lev_src(icol, ilay, igpt) = sqrt(pfrac * pfrac_m1) * planck_function_1; - } + lev_src(icol, ilay, igpt) = sqrt(pfrac * pfrac_m1) * planck_function_1; + } - if (ilay == nlay) - { - planck_function_1 = interpolate1D(tlev(icol, ilay+1), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); - lev_src(icol, ilay+1, igpt) = pfrac * planck_function_1; - } + if (ilay == nlay) + { + planck_function_1 = interpolate1D(tlev(icol, ilay+1), temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); + lev_src(icol, ilay+1, igpt) = pfrac * planck_function_1; + } - if (ilay == sfc_lay) - { - planck_function_1 = interpolate1D(tsfc(icol) , temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); - const Float planck_function_2 = interpolate1D(tsfc(icol) + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); - sfc_src (icol, igpt) = pfrac * planck_function_1; - sfc_src_jac(icol, igpt) = pfrac * (planck_function_2 - planck_function_1); - } + if (ilay == sfc_lay) + { + planck_function_1 = interpolate1D(tsfc(icol) , temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); + const Float planck_function_2 = interpolate1D(tsfc(icol) + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk(1, ibnd)); + sfc_src (icol, igpt) = pfrac * planck_function_1; + sfc_src_jac(icol, igpt) = pfrac * (planck_function_2 - planck_function_1); } } } diff --git a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu index e17440b..30f9ddd 100644 --- a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu +++ b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu @@ -473,23 +473,17 @@ namespace Gas_optics_rrtmgp_kernels_cuda const Float delta_Tsurf = Float(1.); - const int block_col = 32; - const int block_lay = 4; - - const int grid_col = ncol/block_col + (ncol%block_col > 0); - const int grid_lay = nlay/block_lay + (nlay%block_lay > 0); - - dim3 grid_gpu(grid_col, grid_lay); - dim3 block_gpu(block_col, block_lay); + dim3 grid_gpu; + dim3 block_gpu; if (tunings.count("Planck_source_kernel") == 0) { std::tie(grid_gpu, block_gpu) = tune_kernel( "Planck_source_kernel", - dim3(ncol, nlay), - {1, 2, 4, 8, 16, 32, 48, 64, 96, 128, 256, 512}, - {1, 2, 4, 8, 16, 32, 48, 64, 96, 128, 256, 512}, + dim3(ncol, nlay, ngpt), + {4, 8, 16, 32, 48, 64, 96, 128}, {1}, + {4, 8, 16, 32, 48, 64, 96, 128}, Planck_source_kernel, ncol, nlay, nbnd, ngpt, nflav, neta, npres, ntemp, nPlanckTemp, @@ -510,7 +504,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda block_gpu = tunings["Planck_source_kernel"].second; } - grid_gpu = calc_grid_size(block_gpu, dim3(ncol, nlay)); + grid_gpu = calc_grid_size(block_gpu, dim3(ncol, nlay, ngpt)); Planck_source_kernel<<>>( ncol, nlay, nbnd, ngpt,