From d0cb33ebbe73b9d31e96e2b02903970d31679b00 Mon Sep 17 00:00:00 2001
From: cjknight
Date: Thu, 10 Oct 2024 14:16:27 -0500
Subject: [PATCH] update _getjk_unpack_buf2 to support large workloads

---
 gpu/src/device_cuda.cpp | 286 ++++++++++++++++++++++++++++++----------
 1 file changed, 213 insertions(+), 73 deletions(-)

diff --git a/gpu/src/device_cuda.cpp b/gpu/src/device_cuda.cpp
index 1ba02a48..121bc968 100644
--- a/gpu/src/device_cuda.cpp
+++ b/gpu/src/device_cuda.cpp
@@ -15,7 +15,7 @@
 #define _HESSOP_BLOCK_SIZE 32
 #define _DEFAULT_BLOCK_SIZE 32
 
-//#define _DEBUG_DEVICE
+#define _DEBUG_DEVICE
 //#define _DEBUG_H2EFF
 //#define _DEBUG_H2EFF2
 #define _DEBUG_H2EFF_DF
@@ -772,24 +772,21 @@ __global__ void _getjk_vj(double * vj, double * rho, double * eri1, int nset, in
 
 /* ---------------------------------------------------------------------- */
 
-#if 0
+#if 1
 __global__ void _getjk_unpack_buf2(double * buf2, double * eri1, int * map, int naux, int nao, int nao_pair)
 {
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
 
   if(i >= naux) return;
-
-  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if(j >= nao) return;
 
   double * buf = &(buf2[i * nao * nao]);
   double * tril = &(eri1[i * nao_pair]);
 
-  while (j < nao*nao) {
-    buf[j] = tril[ map[j] ];
-    j += blockDim.y;  // * gridDim.y; // gridDim.y is just 1
-  }
-  
+  const int indx = j * nao;
+  for(int k=0; k<nao; ++k) buf[indx + k] = tril[ map[indx + k] ];
 }
@@ -952,7 +949,7 @@ void Device::get_jk(int naux, int nao, int nset,
   _getjk_rho<<<grid_size, block_size, 0, dd->stream>>>(dd->d_rho, dd->d_dmtril, d_eri, nset, naux, nao_pair);
 
 #ifdef _DEBUG_DEVICE
-  printf("LIBGPU :: -- _getjk_rho :: nset= %i naux= %i RHO_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
+  printf("LIBGPU :: -- get_jk::_getjk_rho :: nset= %i naux= %i RHO_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
	 nset, naux, _RHO_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
   _CUDA_CHECK_ERRORS();
 #endif
@@ -989,7 +985,7 @@ void Device::get_jk(int naux, int nao, int nset,
     _getjk_vj<<<grid_size, block_size, 0, dd->stream>>>(dd->d_vj, dd->d_rho, d_eri, nset, nao_pair, naux, init);
 
 #ifdef _DEBUG_DEVICE
-    printf("LIBGPU :: -- _getjk_vj :: nset= %i nao_pair= %i _DOT_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
+    printf("LIBGPU :: -- get_jk::_getjk_vj :: nset= %i nao_pair= %i _DOT_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
	   nset, nao_pair, _DOT_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
     _CUDA_CHECK_ERRORS();
 #endif
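NOTE: _TILE() is not defined in this diff; from the expression it replaces below,
(nao*nao + (_UNPACK_BLOCK_SIZE - 1)) / _UNPACK_BLOCK_SIZE, it is evidently a
ceiling division. A minimal sketch of the assumed macro, plus a host-side
reference (unpack_buf2_ref is a hypothetical name, not part of the patch) for
checking the rewritten kernel:

  // Assumed: ceiling division used to size the grid.
  #define _TILE(N, B) (((N) + (B) - 1) / (B))

  // CPU reference for _getjk_unpack_buf2: for each aux vector i, scatter the
  // packed lower triangle eri1[i, :nao_pair] into the full nao x nao square
  // buf2[i, :, :] through the precomputed pair-index map.
  void unpack_buf2_ref(double * buf2, const double * eri1, const int * map,
                       int naux, int nao, int nao_pair)
  {
    for(int i = 0; i < naux; ++i)
      for(int jk = 0; jk < nao*nao; ++jk)
        buf2[(size_t) i*nao*nao + jk] = eri1[(size_t) i*nao_pair + map[jk]];
  }

The rewrite assigns one (aux, row) pair per thread and copies nao elements in a
loop, so gridDim.y now tiles nao rather than nao*nao. Since gridDim.y is capped
at 65535, the old nao*nao tiling overflows once nao*nao / _UNPACK_BLOCK_SIZE
exceeds that limit (nao around 1450 for 32-wide tiles), which is presumably the
"large workloads" named in the subject line.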
grid_size= %i %i %i block_size= %i %i %i\n", + naux, nao, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); _CUDA_CHECK_ERRORS(); #endif } @@ -1093,16 +1087,19 @@ void Device::get_jk(int naux, int nao, int nset, #if 1 dim3 grid_size( _TILE(naux*nao, _TRANSPOSE_BLOCK_SIZE), _TILE(nao, _TRANSPOSE_BLOCK_SIZE), 1); - dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS); - - _transpose<<stream>>>(dd->d_buf3, dd->d_buf1, naux*nao, nao); + dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1); #else dim3 grid_size(naux*nao, 1, 1); dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1); +#endif _transpose<<stream>>>(dd->d_buf3, dd->d_buf1, naux*nao, nao); -#endif +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- get_jk::_transpose :: naux= %i nao= %i _TRANSPOSE_BLOCK_SIZE= %i _TRANSPOSE_NUM_ROWS= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nao, _TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } // vk[k] += lib.dot(buf3, buf4) @@ -1313,9 +1310,18 @@ void Device::df_ao2mo_pass1 (int naux, int nmo, int nao, int ncore, int ncas, int * my_d_tril_map_ptr = dd_fetch_pumap(dd, nao); - { dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); - dim3 grid_size(_TILE(naux,block_size.x), _TILE(nao*nao, block_size.y), 1); - _getjk_unpack_buf2<<stream>>>(d_eri_unpacked, d_eri, my_d_tril_map_ptr, naux, nao, nao_pair); } + { + dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); + dim3 grid_size(_TILE(naux,block_size.x), _TILE(nao, block_size.y), 1); + + _getjk_unpack_buf2<<stream>>>(d_eri_unpacked, d_eri, my_d_tril_map_ptr, naux, nao, nao_pair); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- df_ao2mo_pass1::_getjk_unpack_buf2 :: naux= %i nao= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nao, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif + } //bufpp = mo.T @ eri @ mo //buf = np.einsum('ijk,kl->ijl',eri_unpacked,mo_coeff),i=naux,j=nao,l=nao @@ -1347,18 +1353,35 @@ void Device::df_ao2mo_pass1 (int naux, int nmo, int nao, int ncore, int ncas, double * d_bufpa = (double *) pm->dev_malloc (naux*nmo*ncas*sizeof(double)); #endif - {dim3 block_size(1,1,1); - dim3 grid_size (_TILE(naux, block_size.x), _TILE(nmo, block_size.y), ncas); - get_bufpa<<stream>>>(d_bufpp, d_bufpa, naux, nmo, ncore, ncas);} + { + dim3 block_size(1,1,1); + dim3 grid_size (_TILE(naux, block_size.x), _TILE(nmo, block_size.y), ncas); + + get_bufpa<<stream>>>(d_bufpp, d_bufpa, naux, nmo, ncore, ncas); +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- df_ao2mo_pass1::get_bufpa :: naux= %i nmo= %i ncas= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nmo, ncas, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif + } + pm->dev_pull_async(d_bufpa, bufpa, naux*nmo*ncas*sizeof(double), dd->stream); double * d_fxpp = dd->d_buf1; // fxpp[str(k)] =bufpp.transpose(1,2,0); { - dim3 block_size (1, 1,1); - dim3 grid_size (_TILE(naux, block_size.x),nmo,nmo) ; - transpose_120<<stream>>>(d_bufpp, d_fxpp, naux, nmo, nmo);} + dim3 block_size (1, 1,1); + dim3 grid_size (_TILE(naux, block_size.x),nmo,nmo) ; + + transpose_120<<stream>>>(d_bufpp, d_fxpp, naux, nmo, nmo); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- df_ao2mo_pass1::transpose_120 :: naux= %i nmo= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nmo, 
grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif + } if (counthandle, CUBLAS_OP_N, CUBLAS_OP_T, @@ -1392,11 +1415,22 @@ else } double * d_bufd = dd->d_bufd; #else - double * d_bufd = (double *) pm->dev_malloc_async(naux*nmo*sizeof(double),dd->stream); + double * d_bufd = (double *) pm->dev_malloc_async(naux*nmo*sizeof(double),dd->stream); +#endif + + { + dim3 block_size (1,1,1); + dim3 grid_size (_TILE(naux, block_size.x),_TILE(nmo, block_size.y),1) ; + + get_bufd<<stream>>>(d_bufpp, d_bufd, naux, nmo); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- df_ao2mo_pass1::get_bufd :: naux= %i nmo= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nmo, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); #endif - {dim3 block_size (1,1,1); - dim3 grid_size (_TILE(naux, block_size.x),_TILE(nmo, block_size.y),1) ; - get_bufd<<stream>>>(d_bufpp, d_bufd, naux, nmo);} + } + // self.j_pc += numpy.einsum('ki,kj->ij', bufd, bufd[:,:ncore]) if (counthandle,CUBLAS_OP_N, CUBLAS_OP_T, @@ -1519,9 +1553,16 @@ void Device::df_ao2mo_pass1_fdrv (int naux, int nmo, int nao, int blksize, int * my_d_tril_map_ptr = dd_fetch_pumap(dd, nao); { - dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao*nao, _UNPACK_BLOCK_SIZE), 1); + dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao, _UNPACK_BLOCK_SIZE), 1); dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); + _getjk_unpack_buf2<<stream>>>(d_eri_unpacked, d_eri, my_d_tril_map_ptr, naux, nao, nao_pair); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- df_ao2mo_pass1_fdrv::getjk_unpack_buf2 :: naux= %i nao= %i grid_size= %i %i %i block_size= %i %i %i\n", + naux, nao, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } #ifdef _DEBUG_DEVICE @@ -1730,9 +1771,16 @@ void Device::update_h2eff_sub(int ncore, int ncas, int nocc, int nmo, //ucas = umat[ncore:nocc, ncore:nocc] { - dim3 blockDim(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE); - dim3 gridDim(_TILE(ncas,blockDim.x), _TILE(ncas,blockDim.y)); - extract_submatrix<<stream>>>(dd->d_umat, dd->d_ucas, ncas, ncore, nmo); + dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE); + dim3 grid_size(_TILE(ncas,block_size.x), _TILE(ncas,block_size.y)); + + extract_submatrix<<stream>>>(dd->d_umat, dd->d_ucas, ncas, ncore, nmo); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- update_h2eff_sub::extract_submatrix :: ncas= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n", + ncas, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } //h2eff_sub = h2eff_sub.reshape (nmo*ncas, ncas*(ncas+1)//2) @@ -1760,10 +1808,16 @@ void Device::update_h2eff_sub(int ncore, int ncas, int nocc, int nmo, #endif { - dim3 blockDim(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); - dim3 gridDim(_TILE(nmo*ncas,_UNPACK_BLOCK_SIZE), _TILE(ncas*ncas,_UNPACK_BLOCK_SIZE), 1); - _unpack_h2eff_2d<<stream>>>(d_h2eff_sub, d_h2eff_unpacked, d_my_unpack_map_ptr, nmo, ncas, ncas_pair); + dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); + dim3 grid_size(_TILE(nmo*ncas,_UNPACK_BLOCK_SIZE), _TILE(ncas*ncas,_UNPACK_BLOCK_SIZE), 1); + + _unpack_h2eff_2d<<stream>>>(d_h2eff_sub, d_h2eff_unpacked, d_my_unpack_map_ptr, nmo, ncas, ncas_pair); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- update_h2eff_sub::_unpack_h2eff_2d :: nmo*ncas= %i ncas*ncas= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i 
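NOTE: every _getjk_unpack_buf2 and _unpack_h2eff_2d launch above depends on the
pair-index map returned by dd_fetch_pumap(), which this diff does not show. A
hedged host sketch of the map it is assumed to provide (square index -> packed
lower-triangle offset, matching unpack_tril semantics; build_unpack_map is a
hypothetical helper name):

  // Sketch only: map[i*n + j] is the offset of element (i,j) in the packed
  // lower triangle of an n x n symmetric matrix (n*(n+1)/2 entries).
  void build_unpack_map(int * map, int n)
  {
    for(int i = 0; i < n; ++i)
      for(int j = 0; j < n; ++j) {
        const int a = (i > j) ? i : j;  // larger index selects the row
        const int b = (i > j) ? j : i;  // smaller index selects the column
        map[i*n + j] = a*(a+1)/2 + b;
      }
  }

Because the map is indexed by the full square index, the kernel rewrite only
changes which thread reads which entries; the map itself is unchanged.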
block_size= %i %i %i\n", + nmo*ncas, ncas*ncas, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); _CUDA_CHECK_ERRORS(); +#endif } profile_next("2 dgemms"); @@ -1814,9 +1868,16 @@ void Device::update_h2eff_sub(int ncore, int ncas, int nocc, int nmo, double * d_h2eff_transposed = dd->d_buf2; { - dim3 blockDim(1,1,_DEFAULT_BLOCK_SIZE); - dim3 gridDim(_TILE(nmo,blockDim.x),_TILE(ncas,blockDim.y),_TILE(ncas,blockDim.z)); - transpose_2310<<stream>>>(d_h2eff_step2, d_h2eff_transposed, nmo, ncas); + dim3 block_size(1,1,_DEFAULT_BLOCK_SIZE); + dim3 grid_size(_TILE(nmo,block_size.x),_TILE(ncas,block_size.y),_TILE(ncas,block_size.z)); + + transpose_2310<<stream>>>(d_h2eff_step2, d_h2eff_transposed, nmo, ncas); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- update_h2eff_sub::transpose_2310 :: nmo= %i ncas= %i _DEFAULT_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n", + nmo, ncas, _DEFAULT_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } profile_next("last 2 dgemm"); @@ -1857,9 +1918,16 @@ void Device::update_h2eff_sub(int ncore, int ncas, int nocc, int nmo, //h2eff_tranposed=(JKIP->PIJK) 3201 { - dim3 blockDim(1,1,_DEFAULT_BLOCK_SIZE); - dim3 gridDim(_TILE(ncas,blockDim.x),_TILE(ncas,blockDim.y),_TILE(ncas,blockDim.z)); - transpose_3210<<stream>>>(d_h2eff_step4, d_h2eff_transpose2, nmo, ncas); + dim3 block_size(1,1,_DEFAULT_BLOCK_SIZE); + dim3 grid_size(_TILE(ncas,block_size.x),_TILE(ncas,block_size.y),_TILE(ncas,block_size.z)); + + transpose_3210<<stream>>>(d_h2eff_step4, d_h2eff_transpose2, nmo, ncas); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- update_h2eff_sub::transpose_3210 :: ncas= %i _DEFAULT_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n", + ncas, _DEFAULT_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } #ifdef _DEBUG_H2EFF @@ -1876,9 +1944,16 @@ void Device::update_h2eff_sub(int ncore, int ncas, int nocc, int nmo, int * d_my_pack_map_ptr = dd_fetch_pumap(dd, ncas, _PUMAP_H2EFF_PACK); { - dim3 blockDim(1, 1, _UNPACK_BLOCK_SIZE); - dim3 gridDim(nmo, ncas, _TILE(ncas_pair, _DEFAULT_BLOCK_SIZE)); - _pack_h2eff_2d<<stream>>>(d_h2eff_transpose2, d_h2eff_sub, d_my_pack_map_ptr, nmo, ncas, ncas_pair); + dim3 block_size(1, 1, _UNPACK_BLOCK_SIZE); + dim3 grid_size(nmo, ncas, _TILE(ncas_pair, _DEFAULT_BLOCK_SIZE)); + + _pack_h2eff_2d<<stream>>>(d_h2eff_transpose2, d_h2eff_sub, d_my_pack_map_ptr, nmo, ncas, ncas_pair); + +#ifdef _DEBUG_DEVICE + printf("LIBGPU :: -- update_h2eff_sub::_pack_h2eff_2d :: nmo= %i ncas= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n", + nmo, ncas, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z); + _CUDA_CHECK_ERRORS(); +#endif } #ifdef _DEBUG_H2EFF @@ -1954,9 +2029,19 @@ void Device::h2eff_df_contract1(py::array_t _cderi, int * d_my_unpack_map = (int*) pm->dev_malloc(_size_unpack_map*sizeof(int)); int * d_my_unpack_map_ptr = d_my_unpack_map; pm->dev_push(d_my_unpack_map, my_unpack_map,_size_unpack_map*sizeof(int)); - {dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); - dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao*nao,_UNPACK_BLOCK_SIZE)); - _getjk_unpack_buf2<<stream>>>(d_cderi_unpacked,d_cderi,d_my_unpack_map_ptr,naux, nao, nao_pair);} + + { + dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1); + dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), 
@@ -1954,9 +2029,19 @@ void Device::h2eff_df_contract1(py::array_t<double> _cderi,
   int * d_my_unpack_map = (int*) pm->dev_malloc(_size_unpack_map*sizeof(int));
   int * d_my_unpack_map_ptr = d_my_unpack_map;
   pm->dev_push(d_my_unpack_map, my_unpack_map,_size_unpack_map*sizeof(int));
-  {dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
-    dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao*nao,_UNPACK_BLOCK_SIZE));
-    _getjk_unpack_buf2<<<grid_size, block_size, 0, dd->stream>>>(d_cderi_unpacked,d_cderi,d_my_unpack_map_ptr,naux, nao, nao_pair);}
+  
+  {
+    dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
+    dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao,_UNPACK_BLOCK_SIZE));
+    
+    _getjk_unpack_buf2<<<grid_size, block_size, 0, dd->stream>>>(d_cderi_unpacked,d_cderi,d_my_unpack_map_ptr,naux, nao, nao_pair);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- h2eff_df_contract1::_getjk_unpack_buf2 :: naux= %i nao= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   naux, nao, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
 
   //bmuP1 = bPmn.contract1 (mo_cas)
   //bmuP1 = np.einsum('Pmn,nu->Pmu',unpack_tril(bPmn),mo_cas)
@@ -1968,9 +2053,19 @@ void Device::h2eff_df_contract1(py::array_t<double> _cderi,
			   &alpha, d_cderi_unpacked, nao, nao*nao, d_mo_cas, ncas, 0, &beta, d_bPmu, nao, ncas*nao, naux);
   _CUDA_CHECK_ERRORS();
-  {dim3 block_size(_UNPACK_BLOCK_SIZE, 1, _UNPACK_BLOCK_SIZE);
-    dim3 grid_size(_TILE(naux,block_size.x), _TILE(ncas,block_size.y), _TILE(nao,block_size.z));
-    transpose_210<<<grid_size, block_size, 0, dd->stream>>>(d_bPmu, d_bmuP,naux,nao,ncas);}
+  {
+    dim3 block_size(_UNPACK_BLOCK_SIZE, 1, _UNPACK_BLOCK_SIZE);
+    dim3 grid_size(_TILE(naux,block_size.x), _TILE(ncas,block_size.y), _TILE(nao,block_size.z));
+    
+    transpose_210<<<grid_size, block_size, 0, dd->stream>>>(d_bPmu, d_bmuP,naux,nao,ncas);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- h2eff_df_contract1::transpose_210 :: naux= %i ncas= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   naux, ncas, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
+  
   pm->dev_pull(d_bmuP,bmuP,_size_bmuP*sizeof(double));
   //pm->dev_free(d_mo_cas);
   pm->dev_free(d_cderi);
@@ -2032,9 +2127,18 @@ void Device::get_h2eff_df(py::array_t<double> _cderi,
 
   double * cderi = static_cast<double*>(info_cderi.ptr);
 
   // d_mo_cas
-  {dim3 block_size(1,1,1);
-    dim3 grid_size(_TILE(ncas, block_size.x), _TILE(nao, block_size.y));
-    get_mo_cas<<<grid_size, block_size, 0, dd->stream>>>(d_mo_coeff, d_mo_cas, ncas, ncore, nao);}
+  {
+    dim3 block_size(1,1,1);
+    dim3 grid_size(_TILE(ncas, block_size.x), _TILE(nao, block_size.y));
+    
+    get_mo_cas<<<grid_size, block_size, 0, dd->stream>>>(d_mo_coeff, d_mo_cas, ncas, ncore, nao);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- get_h2eff_df::get_mo_cas :: ncas= %i nao= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   ncas, nao, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
 
   double * d_cderi;
   if(use_eri_cache) {
@@ -2053,10 +2157,18 @@ void Device::get_h2eff_df(py::array_t<double> _cderi,
 
   int * d_my_unpack_map_ptr = dd_fetch_pumap(dd, nao);
 
-  {dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
-    dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao*nao,_UNPACK_BLOCK_SIZE));
-    _getjk_unpack_buf2<<<grid_size, block_size, 0, dd->stream>>>(d_cderi_unpacked,d_cderi,d_my_unpack_map_ptr,naux, nao, nao_pair);}
+  {
+    dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
+    dim3 grid_size(_TILE(naux,_UNPACK_BLOCK_SIZE), _TILE(nao,_UNPACK_BLOCK_SIZE));
+    
+    _getjk_unpack_buf2<<<grid_size, block_size, 0, dd->stream>>>(d_cderi_unpacked,d_cderi,d_my_unpack_map_ptr,naux, nao, nao_pair);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- get_h2eff_df::_getjk_unpack_buf2 :: naux= %i nao= %i _UNPACK_BLOCK_SIZE= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   naux, nao, _UNPACK_BLOCK_SIZE, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
 
   //bPmu = np.einsum('Pmn,nu->Pmu',cderi,mo_cas)
   const double alpha=1.0;
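NOTE: the 'Pmn,nu->Pmu' contractions here map onto a strided batched GEMM with
the auxiliary index P as the batch dimension. A hedged reading of the visible
argument tail above (cuBLAS is column-major; the operation flags and m/n/k are
inferred from the leading dimensions and strides, not confirmed by this diff):

  // Sketch only: one GEMM per aux vector P; the cderi slice advances by
  // nao*nao, mo_cas is shared (stride 0), each output slice by ncas*nao.
  cublasDgemmStridedBatched(dd->handle, CUBLAS_OP_N, CUBLAS_OP_T,
                            nao, ncas, nao,
                            &alpha,
                            d_cderi_unpacked, nao, (long long) nao*nao,
                            d_mo_cas, ncas, 0,
                            &beta,
                            d_bPmu, nao, (long long) ncas*nao,
                            naux);

Since each unpacked cderi slice is symmetric, transposing the first operand
would not change the result; only the mo_cas flag is load-bearing.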
@@ -2083,17 +2195,35 @@ void Device::get_h2eff_df(py::array_t<double> _cderi,
 
   //eri = np.einsum('Pmw,Pvu->mwvu', bPmu, bPvu)
   //transpose bPmu
   double * d_bumP = (double*) pm->dev_malloc (_size_bPmu *sizeof(double));
-  {dim3 block_size(1,1,1);
-    dim3 grid_size(_TILE(naux, block_size.x),_TILE(nao, block_size.y),_TILE(ncas, block_size.z));
-    transpose_120<<<grid_size, block_size, 0, dd->stream>>>(d_bPmu, d_bumP, naux, ncas, nao);}
+  {
+    dim3 block_size(1,1,1);
+    dim3 grid_size(_TILE(naux, block_size.x),_TILE(nao, block_size.y),_TILE(ncas, block_size.z));
+    
+    transpose_120<<<grid_size, block_size, 0, dd->stream>>>(d_bPmu, d_bumP, naux, ncas, nao);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- get_h2eff_df::transpose_120 :: naux= %i nao= %i ncas= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   naux, nao, ncas, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
+  
   pm->dev_free(d_bPmu);
 
   double * d_buvP = (double*) pm->dev_malloc (_size_bPvu *sizeof(double));
   //transpose bPvu
-  {dim3 block_size(1,1,1);
-    dim3 grid_size(_TILE(naux, block_size.x),_TILE(ncas, block_size.y),_TILE(ncas, block_size.z));
-    transpose_210<<<grid_size, block_size, 0, dd->stream>>>(d_bPvu, d_buvP, naux, ncas, ncas);}
+  {
+    dim3 block_size(1,1,1);
+    dim3 grid_size(_TILE(naux, block_size.x),_TILE(ncas, block_size.y),_TILE(ncas, block_size.z));
+    
+    transpose_210<<<grid_size, block_size, 0, dd->stream>>>(d_bPvu, d_buvP, naux, ncas, ncas);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- get_h2eff_df::transpose_210 :: naux= %i ncas= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   naux, ncas, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
 
   pm->dev_free(d_bPvu);
 
@@ -2127,9 +2257,19 @@ cublasDgemm(dd->handle, CUBLAS_OP_T, CUBLAS_OP_N,
   double * d_eri = (double*) pm->dev_malloc (_size_eri*sizeof(double));
 
   int * my_d_tril_map_ptr = dd_fetch_pumap(dd, ncas);
 
-  {dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
-    dim3 grid_size(_TILE(nmo*ncas,block_size.x), _TILE(ncas*ncas,block_size.y));
-    pack_d_vuwM<<<grid_size, block_size, 0, dd->stream>>>(d_vuwM,d_eri,nmo, ncas, ncas_pair, my_d_tril_map_ptr);}
+  {
+    dim3 block_size(_UNPACK_BLOCK_SIZE, _UNPACK_BLOCK_SIZE, 1);
+    dim3 grid_size(_TILE(nmo*ncas,block_size.x), _TILE(ncas*ncas,block_size.y));
+    
+    pack_d_vuwM<<<grid_size, block_size, 0, dd->stream>>>(d_vuwM,d_eri,nmo, ncas, ncas_pair, my_d_tril_map_ptr);
+    
+#ifdef _DEBUG_DEVICE
+    printf("LIBGPU :: -- get_h2eff_df::pack_d_vuwM :: nmo*ncas= %i ncas*ncas= %i grid_size= %i %i %i block_size= %i %i %i\n",
+	   nmo*ncas, ncas*ncas, grid_size.x,grid_size.y,grid_size.z,block_size.x,block_size.y,block_size.z);
+    _CUDA_CHECK_ERRORS();
+#endif
+  }
+  
   pm->dev_free(d_vuwM);
 
   pm->dev_pull(d_eri, eri, _size_eri*sizeof(double));
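NOTE: a compact way to sanity-check the re-tiled launches in this patch is to
compare each grid against the device limits: gridDim.x may be up to 2^31 - 1 on
compute capability 3.0 and newer, while gridDim.y and gridDim.z are capped at
65535. Hedged helper, not part of the patch (check_grid is a hypothetical name):

  #include <cassert>
  #include <cuda_runtime.h>

  // Assert that a proposed launch configuration fits the current device.
  void check_grid(dim3 grid)
  {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    assert(grid.x <= (unsigned) prop.maxGridSize[0]);
    assert(grid.y <= (unsigned) prop.maxGridSize[1]);
    assert(grid.z <= (unsigned) prop.maxGridSize[2]);
  }

  // The old unpack tiling fails for large basis sets; the new one does not:
  //   check_grid(dim3(naux, _TILE(nao*nao, 32), 1)); // fires once nao >~ 1449
  //   check_grid(dim3(naux, _TILE(nao, 32), 1));     // fine for realistic nao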