Attempt to use cached eri failed for h2eff
Valay Agarawal committed Aug 5, 2024
1 parent e31c6c1 commit a28bf0a
Showing 5 changed files with 36 additions and 29 deletions.
2 changes: 1 addition & 1 deletion gpu/src/device.h
@@ -93,7 +93,7 @@ public :
py::array_t<double>, py::array_t<double>);
void get_h2eff_df( py::array_t<double> ,
int , int , int , int , int ,
-py::array_t<double> );
+py::array_t<double>, int, size_t );

void push_mo_coeff(py::array_t<double>, int);
private:
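The two new parameters identify a cached cderi block on the device: addr_dfobj is the host address of the density-fitting object and count indexes the block within it; the debug print added in device_cuda.cpp below shows the combined key as addr_dfobj+count. A minimal sketch of a cache keyed this way, with illustrative names (EriCache and lookup are not this repository's API):

// Illustrative sketch only (EriCache/lookup are hypothetical names): a
// device-side ERI cache keyed the way the new arguments combine.
#include <cstddef>
#include <unordered_map>

struct EriCache {
  std::unordered_map<size_t, double*> blocks;   // combined key -> device buffer

  double * lookup(size_t addr_dfobj, int count) {
    size_t key = addr_dfobj + static_cast<size_t>(count); // "combined" in the debug print
    auto it = blocks.find(key);
    return (it == blocks.end()) ? nullptr : it->second;   // nullptr -> push from host
  }
};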
29 changes: 25 additions & 4 deletions gpu/src/device_cuda.cpp
@@ -1664,15 +1664,18 @@ __global__ void pack_Mwuv(double *in, double *out, int * map,int nao, int ncas,i

void Device::get_h2eff_df(py::array_t<double> _cderi,
int nao, int nmo, int ncas, int naux, int ncore,
-py::array_t<double> _eri)
+py::array_t<double> _eri, int count, size_t addr_dfobj)
{
#ifdef _SIMPLE_TIMER
double t0 = omp_get_wtime();
#endif

#ifdef _DEBUG_DEVICE
printf("LIBGPU :: Inside Device :: Starting h2eff_df_contract1 function");
printf("LIBGPU:: dfobj= %#012x count= %i combined= %#012x %p update_dfobj= %i\n",addr_dfobj,count,addr_dfobj+count,addr_dfobj+count,update_dfobj);
#endif
+py::buffer_info info_cderi = _cderi.request(); // 2D array blksize * nao_pair

+profile_start("h2eff df setup");
py::buffer_info info_eri = _eri.request(); //2D array nao * ncas
const int device_id = 0;
pm->dev_set_device(device_id);
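One caveat in the debug print added above: addr_dfobj is a size_t but is formatted with %#012x, which reads only 32 bits on LP64 platforms, and the raw sum is passed to %p without a pointer cast. A portable variant, assuming the same fields are wanted (%zx is the C99/C++11 size_t length modifier):

// Hedged rewrite of the added debug line, not part of this commit.
printf("LIBGPU:: dfobj= %#012zx count= %i combined= %#012zx %p update_dfobj= %i\n",
       addr_dfobj, count, addr_dfobj + (size_t) count,
       (void*) (addr_dfobj + (size_t) count), update_dfobj);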
@@ -1684,9 +1687,10 @@ void Device::get_h2eff_df(py::array_t<double> _cderi,
const int _size_cderi_unpacked = naux*nao*nao;
const int _size_mo_cas = nao*ncas;
double * eri = static_cast<double*>(info_eri.ptr);
+double * cderi = static_cast<double*>(info_cderi.ptr);
double * d_mo_coeff = dd->d_mo_coeff;
double * d_mo_cas = (double*) pm->dev_malloc(_size_mo_cas*sizeof(double));
-py::buffer_info info_cderi = _cderi.request(); // 2D array blksize * nao_pair
-double * cderi = static_cast<double*>(info_cderi.ptr);
//
// write code to extract d_mo_cas from d_mo_coeff
{dim3 block_size(1,1,1);
@@ -1710,8 +1714,23 @@ void Device::get_h2eff_df(py::array_t<double> _cderi,
free(h_mo_coeff);
#endif
// unpacking business that should really just have been done with stored map already and also with the stored eris
+#if 0
+double * d_cderi;
+if(use_eri_cache) {
+d_cderi = dd_fetch_eri(dd, eri, addr_dfobj, count);
+} else {
+if(_size_cderi > dd->size_eri1) {
+dd->size_eri1 = _size_cderi;
+if(dd->d_eri1) pm->dev_free_async(dd->d_eri1, dd->stream);
+dd->d_eri1 = (double *) pm->dev_malloc_async(_size_cderi * sizeof(double), dd->stream);
+}
+
+pm->dev_push(d_cderi, cderi, _size_cderi * sizeof(double));
+}
+#else
double * d_cderi=(double*) pm->dev_malloc( _size_cderi * sizeof(double));
pm->dev_push(d_cderi,cderi,_size_cderi*sizeof(double));
+#endif
double * d_cderi_unpacked=(double*) pm->dev_malloc( _size_cderi_unpacked * sizeof(double));
int _size_unpack_map = nao*nao;
int * my_unpack_map = (int*) malloc(_size_unpack_map*sizeof(int));
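Note that in the disabled #if 0 branch above, the non-cached path grows dd->d_eri1 but never points d_cderi at it, so the subsequent dev_push copies into an uninitialized pointer; the dd_fetch_eri call also receives eri (the output buffer) rather than cderi, which may or may not be intended. A sketch of the presumably intended else branch, reusing the identifiers and allocator calls shown above:

// Sketch only (assumption: d_cderi should alias the per-device scratch
// buffer dd->d_eri1 before the host-to-device copy).
if(_size_cderi > dd->size_eri1) {
  dd->size_eri1 = _size_cderi;
  if(dd->d_eri1) pm->dev_free_async(dd->d_eri1, dd->stream);
  dd->d_eri1 = (double *) pm->dev_malloc_async(_size_cderi * sizeof(double), dd->stream);
}
d_cderi = dd->d_eri1;  // this assignment is absent in the #if 0 branch
pm->dev_push(d_cderi, cderi, _size_cderi * sizeof(double));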
@@ -2004,12 +2023,14 @@ cublasDgemm(dd->handle, CUBLAS_OP_T, CUBLAS_OP_N,
//eri[i*ncas_pair*ncas+j*ncas_pair+kl]=h_vuwM[i*ncas*ncas*ncas+j*ncas*ncas+k*ncas+l];}}}}
#
#endif

+#if 1
pm->dev_free(d_cderi);
+#endif
pm->dev_free(d_cderi_unpacked);
pm->dev_free(d_bPmu);
pm->dev_free(d_my_unpack_map);
free(my_unpack_map);
+profile_stop();
#ifdef _SIMPLE_TIMER
double t1 = omp_get_wtime();
//t_array[8] += t1 - t0;//TODO: add the array size
4 changes: 2 additions & 2 deletions gpu/src/libgpu.cpp
@@ -188,9 +188,9 @@ void libgpu_h2eff_df_contract1(void * ptr,
void libgpu_get_h2eff_df(void * ptr,
py::array_t<double> cderi,
int nao, int nmo, int ncas, int naux, int ncore,
-py::array_t<double> eri1)
+py::array_t<double> eri1, int count, size_t addr_dfobj)
{
Device * dev = (Device *) ptr;
-dev->get_h2eff_df(cderi, nao, nmo, ncas, naux, ncore, eri1);
+dev->get_h2eff_df(cderi, nao, nmo, ncas, naux, ncore, eri1, count, addr_dfobj);
}
/* ---------------------------------------------------------------------- */
2 changes: 1 addition & 1 deletion gpu/src/libgpu.h
@@ -61,7 +61,7 @@ extern "C"
void libgpu_get_h2eff_df(void * ,
py::array_t<double> ,
int , int , int , int , int ,
-py::array_t<double> );
+py::array_t<double>, int, size_t);
}


28 changes: 7 additions & 21 deletions my_pyscf/mcscf/las_ao2mo.py
@@ -89,9 +89,7 @@ def get_h2eff_gpu (las,mo_coeff):
log = lib.logger.new_logger (las, las.verbose)
gpu=las.use_gpu
nao, nmo = mo_coeff.shape
-#mo_coeff=np.random.randint(0,2,size=mo_coeff.shape)#.reshape(mo_coeff.shape) ######################################################################
-#mo_coeff[0,0]=100
-#mo_coeff[0,3]=-100
+count = 0
ncore, ncas = las.ncore, las.ncas
nocc = ncore + ncas
mo_cas = mo_coeff[:,ncore:nocc]
@@ -116,13 +114,12 @@ def get_h2eff_gpu (las,mo_coeff):
mem_per_aux += nao*lib.num_threads () # wrk in contract1
if not isinstance (getattr (las.with_df, '_cderi', None), np.ndarray):
mem_per_aux += 3*nao*(nao+1)//2 # cderi / bPmn
-# NOTE: I think a linalg.norm operation in sparsedf_array might be doubling the memory
-# footprint of bPmn below
else:
mem_per_aux += nao*(nao+1) # see note above
mem_per_aux *= safety_factor * 8 / 1e6
mem_per_aux = max (1, mem_per_aux)
-blksize = max (1, min (naux, int (mem_av / mem_per_aux)))
+#blksize = max (1, min (naux, int (mem_av / mem_per_aux)))
+blksize=las.with_df.blockdim
log.debug2 ("LAS DF ERI blksize = %d, mem_av = %d MB, mem_per_aux = %d MB", blksize, mem_av, mem_per_aux)
log.debug2 ("LAS DF ERI naux = %d, nao = %d, nmo = %d", naux, nao, nmo)
assert (blksize>1)
@@ -133,29 +130,17 @@ def get_h2eff_gpu (las,mo_coeff):
naux = cderi.shape[0]
eri1 = np.empty((nmo, int(ncas*ncas*(ncas+1)/2)),dtype='d')
if DEBUG and gpu:
-libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
-#print('gpu',eri1)
+libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1, count, id(las.with_df))
bPmu = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(cderi),mo_cas)
-
bPvu = np.einsum('mv,Pmu->Pvu',mo_cas.conjugate(),bPmu)
-
bumP = bPmu.transpose(2,1,0)
buvP = bPvu.transpose(2,1,0)
eri2 = np.einsum('uvP,wmP->uvwm', buvP, bumP)
eri2 = np.einsum('mM,uvwm->Mwvu', mo_coeff.conjugate(),eri2)
-#print('final matmul',eri2)
eri2 = lib.pack_tril (eri2.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
-#print('cpu',eri2)
-#bPmn = sparsedf_array (cderi)
-#bmuP1 = bPmn.contract1 (mo_cas)
-#bmuP1 = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(bPmn),mo_cas)
-#bmuP1 = bmuP1.transpose(1,2,0)
-#buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
-#eri2 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
-#eri2 = np.tensordot (mo_coeff.conjugate (), eri2, axes=((0),(0)))
if np.allclose(eri1,eri2): print("h2eff is working")
else: print("h2eff not working"); exit()
-elif gpu: libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
+elif gpu: libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1, count, id(las.with_df))
else:
bPmn = sparsedf_array (cderi)
bmuP1 = bPmn.contract1 (mo_cas)
@@ -166,6 +151,7 @@ def get_h2eff_gpu (las,mo_coeff):
eri1 = lib.pack_tril (eri1.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
cderi = bPmn = bmuP1 = buvP = None
t1 = lib.logger.timer (las, 'contract1 gpu', *t1)
+count+=1
eri +=eri1
eri1= None
return eri
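On the Python side, count starts at 0 and is incremented once per cderi block while id(las.with_df) stays fixed, so each block of each DF object gets a distinct combined key. Because the key depends only on the block index, the block decomposition must be identical on every call; that is presumably why blksize is pinned to las.with_df.blockdim above instead of being derived from available memory. Schematically (addr_dfobj stands in for id(las.with_df); enumerate_keys and nblocks are illustrative, not repository names):

// Illustrative only: one cache key per cderi block, mirroring the loop above.
#include <cstddef>
void enumerate_keys(size_t addr_dfobj, int nblocks) {
  for (int count = 0; count < nblocks; ++count) {
    size_t key = addr_dfobj + (size_t) count; // "combined" in the device debug print
    (void) key; // get_h2eff_df(..., count, addr_dfobj) caches/fetches under this key
  }
}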
@@ -187,7 +173,7 @@ def get_h2eff (las, mo_coeff=None):
raise MemoryError ("{} MB of {}/{} MB av/total for ERI array".format (
mem_eris, mem_remaining, las.max_memory))
if getattr (las, 'with_df', None) is not None:
-if las.use_gpu: eri = get_h2eff_df (las,mo_coeff)#the full version is not working yet.
+if las.use_gpu: eri = get_h2eff_gpu (las,mo_coeff)#the full version is not working yet.
else: eri = get_h2eff_df (las, mo_coeff)
elif getattr (las._scf, '_eri', None) is not None:
eri = ao2mo.incore.general (las._scf._eri, mo, compact=True)
