Skip to content

Commit

Permalink
Merge branch 'valay_gpu' into valay_gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
valay1 authored Jul 29, 2024
2 parents 0807b09 + 8fb6328 commit 62451ef
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 149 deletions.
10 changes: 8 additions & 2 deletions gpu/src/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#include "device.h"

#define _NUM_SIMPLE_TIMER 6
#define _NUM_SIMPLE_TIMER 8

#define _DEBUG_OPENMP

Expand Down Expand Up @@ -150,7 +150,13 @@ Device::~Device()
printf("LIBGPU :: SIMPLE_TIMER :: i= %i name= df_ao2mo_pass1_fdrv() time= %f s\n",5,t_array[5]);

printf("\nLIBGPU :: SIMPLE_TIMER :: _update_h2eff\n");
printf("LIBGPU :: SIMPLE_TIMER :: i= %i name= update_h2eff_sub() time= %f s\n",5,t_array[6]);
printf("LIBGPU :: SIMPLE_TIMER :: i= %i name= update_h2eff_sub() time= %f s\n",6,t_array[6]);

printf("\nLIBGPU :: SIMPLE_TIMER :: _contract1 \n");
printf("LIBGPU :: SIMPLE_TIMER :: i= %i name= h2eff_contract1() time= %f s\n",7,t_array[7]);

printf("\nLIBGPU :: SIMPLE_TIMER :: transfer_mo_coeff \n");
printf("LIBGPU :: SIMPLE_TIMER :: i= %i name= transfer_mo_coeff() time= %f s\n",8,t_array[8]);
printf("LIBGPU :: SIMPLE_TIMER :: total= %f s\n",total);

free(t_array);
Expand Down
5 changes: 4 additions & 1 deletion gpu/src/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ public :
void h2eff_df_contract1(py::array_t<double>,
int, int, int, int, int,
py::array_t<double>, py::array_t<double>);

void get_h2eff_df( py::array_t<double> ,
int , int , int , int , int ,
py::array_t<double> );

void transfer_mo_coeff(py::array_t<double>, int);
private:

Expand Down
411 changes: 289 additions & 122 deletions gpu/src/device_cuda.cpp

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions gpu/src/libgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,12 @@ void libgpu_h2eff_df_contract1(void * ptr,
}

/* ---------------------------------------------------------------------- */
void libgpu_get_h2eff_df(void * ptr,
py::array_t<double> cderi,
int nao, int nmo, int ncas, int naux, int ncore,
py::array_t<double> eri1)
{
Device * dev = (Device *) ptr;
dev->get_h2eff_df(cderi, nao, nmo, ncas, naux, ncore, eri1);
}
/* ---------------------------------------------------------------------- */
7 changes: 7 additions & 0 deletions gpu/src/libgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ extern "C"
py::array_t<double> ,
int , int , int , int , int ,
py::array_t<double> ,py::array_t<double> );
void libgpu_get_h2eff_df(void * ,
py::array_t<double> ,
int , int , int , int , int ,
py::array_t<double> );

}


Expand All @@ -83,6 +88,8 @@ PYBIND11_MODULE(libgpu, m) {
m.def("libgpu_df_ao2mo_pass1_fdrv", &libgpu_df_ao2mo_pass1_fdrv, "pyscf/mcscf/df.py::_ERIS.__init__() part 1");
m.def("libgpu_update_h2eff_sub", &libgpu_update_h2eff_sub, "my_pyscf/mcscf/lasci_sync.py::_update_h2_eff()");
m.def("libgpu_h2eff_df_contract1", &libgpu_h2eff_df_contract1, "my_pyscf/df/sparse_df.py::contract1");

m.def("libgpu_get_h2eff_df", &libgpu_get_h2eff_df, "my_pyscf/mcscf/las_ao2mo.py::get_h2eff_df");

m.def("libgpu_orbital_response", &libgpu_orbital_response, "mrh/lasscf_sync_o0.py::orbital_response");
}
Expand Down
102 changes: 95 additions & 7 deletions my_pyscf/mcscf/las_ao2mo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
from pyscf import ao2mo, lib
from mrh.my_pyscf.df.sparse_df import sparsedf_array
from mrh.my_pyscf.gpu import libgpu
DEBUG=False
DEBUG=True
def get_h2eff_df (las, mo_coeff):
# Store intermediate with one contracted ao index for faster calculation of exchange!
log = lib.logger.new_logger (las, las.verbose)
gpu=las.use_gpu
nao, nmo = mo_coeff.shape
print('full mo_coeff',mo_coeff)
ncore, ncas = las.ncore, las.ncas
nocc = ncore + ncas
mo_cas = mo_coeff[:,ncore:nocc]
if gpu:
libgpu.libgpu_transfer_mo_coeff(gpu,mo_cas.copy(),mo_cas.size)#TODO: make it async
naux = las.with_df.get_naoaux ()
log.debug2 ("LAS DF ERIs: %d MB used of %d MB total available", lib.current_memory ()[0], las.max_memory)
mem_eris = 8*(nao+nmo)*ncas*ncas*ncas / 1e6
Expand All @@ -23,10 +24,8 @@ def get_h2eff_df (las, mo_coeff):
if mem_enough_int:
mem_av -= mem_int
bmuP = []
print("LAS DF ERI including intermediate cache")
log.debug ("LAS DF ERI including intermediate cache")
else:
print("LAS DF ERI not including intermediate cache")
log.debug ("LAS DF ERI not including intermediate cache")
safety_factor = 1.1
mem_per_aux = nao*ncas # bmuP
Expand Down Expand Up @@ -73,19 +72,107 @@ def get_h2eff_df (las, mo_coeff):
str (bPmn.shape), str (np.shares_memory (bPmn, cderi)),
str (np.may_share_memory (bPmn, cderi)),
str (bPmn.flags['C_CONTIGUOUS']))
if mem_enough_int: bmuP.append (bmuP1)
if mem_enough_int and not gpu: bmuP.append (bmuP1)
buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
eri1 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
eri1 = np.tensordot (mo_coeff.conjugate (), eri1, axes=((0),(0)))
eri += lib.pack_tril (eri1.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
cderi = bPmn = bmuP1 = buvP = eri1 = None
t1 = lib.logger.timer (las, 'rest of the calculation', *t1)
if mem_enough_int: eri = lib.tag_array (eri, bmPu=np.concatenate (bmuP, axis=-1).transpose (0,2,1))
if mem_enough_int and not gpu: eri = lib.tag_array (eri, bmPu=np.concatenate (bmuP, axis=-1).transpose (0,2,1))
if las.verbose > lib.logger.DEBUG:
eri_comp = las.with_df.ao2mo (mo, compact=True)
lib.logger.debug(las,"CDERI two-step error: {}".format(linalg.norm(eri-eri_comp)))
return eri

def get_h2eff_gpu (las,mo_coeff):
log = lib.logger.new_logger (las, las.verbose)
gpu=las.use_gpu
nao, nmo = mo_coeff.shape
#mo_coeff=np.random.randint(0,2,size=mo_coeff.shape)#.reshape(mo_coeff.shape) ######################################################################
#mo_coeff[0,0]=100
#mo_coeff[0,3]=-100
ncore, ncas = las.ncore, las.ncas
nocc = ncore + ncas
mo_cas = mo_coeff[:,ncore:nocc]
if gpu: libgpu.libgpu_transfer_mo_coeff(gpu,mo_coeff.copy(),mo_coeff.size)#TODO: make it async
naux = las.with_df.get_naoaux ()
if gpu: blksize = 50
else:
mem_eris = 8*(nao+nmo)*ncas*ncas*ncas / 1e6
mem_eris += 8*lib.num_threads ()*nao*nmo / 1e6
mem_av = las.max_memory - lib.current_memory ()[0] - mem_eris
mem_int = 16*naux*ncas*nao / 1e6
mem_enough_int = mem_av > mem_int
if mem_enough_int:
mem_av -= mem_int
bmuP = []
log.debug ("LAS DF ERI including intermediate cache")
else:
log.debug ("LAS DF ERI not including intermediate cache")
safety_factor = 1.1
mem_per_aux = nao*ncas # bmuP
mem_per_aux += ncas*ncas # buvP
mem_per_aux += nao*lib.num_threads () # wrk in contract1
if not isinstance (getattr (las.with_df, '_cderi', None), np.ndarray):
mem_per_aux += 3*nao*(nao+1)//2 # cderi / bPmn
# NOTE: I think a linalg.norm operation in sparsedf_array might be doubling the memory
# footprint of bPmn below
else:
mem_per_aux += nao*(nao+1) # see note above
mem_per_aux *= safety_factor * 8 / 1e6
mem_per_aux = max (1, mem_per_aux)
blksize = max (1, min (naux, int (mem_av / mem_per_aux)))
log.debug2 ("LAS DF ERI blksize = %d, mem_av = %d MB, mem_per_aux = %d MB", blksize, mem_av, mem_per_aux)
log.debug2 ("LAS DF ERI naux = %d, nao = %d, nmo = %d", naux, nao, nmo)
assert (blksize>1)
eri = 0
t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
for cderi in las.with_df.loop (blksize=blksize):
t1 = lib.logger.timer (las, 'Sparsedf', *t0)
naux = cderi.shape[0]
eri1 = np.empty((nmo, int(ncas*ncas*(ncas+1)/2)),dtype='d')
if DEBUG and gpu:
libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
print('gpu',eri1)
bPmu = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(cderi),mo_cas)

bPvu = np.einsum('mv,Pmu->Pvu',mo_cas.conjugate(),bPmu)

bumP = bPmu.transpose(2,1,0)
buvP = bPvu.transpose(2,1,0)
eri2 = np.einsum('uvP,wmP->uvwm', buvP, bumP)
eri2 = np.einsum('mM,uvwm->Mwvu', mo_coeff.conjugate(),eri2)
print('final matmul',eri2)
eri2 = lib.pack_tril (eri2.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
print('cpu',eri2)
#bPmn = sparsedf_array (cderi)
#bmuP1 = bPmn.contract1 (mo_cas)
#bmuP1 = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(bPmn),mo_cas)
#bmuP1 = bmuP1.transpose(1,2,0)
#buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
#eri2 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
#eri2 = np.tensordot (mo_coeff.conjugate (), eri2, axes=((0),(0)))
if np.allclose(eri1,eri2): print("h2eff is working")
else: print("h2eff not working"); exit()
elif gpu: libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
else:
bPmn = sparsedf_array (cderi)
bmuP1 = bPmn.contract1 (mo_cas)
#bmuP1 = np.einsum('Pmn,nu->muP',unpack_tril(bPmn),mo_cas)
buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
eri1 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
eri1 = np.tensordot (mo_coeff.conjugate (), eri1, axes=((0),(0)))
eri1 = lib.pack_tril (eri1.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
cderi = bPmn = bmuP1 = buvP = None
t1 = lib.logger.timer (las, 'contract1 gpu', *t1)
eri +=eri1
eri1= None
return eri




def get_h2eff (las, mo_coeff=None):
if mo_coeff is None: mo_coeff = las.mo_coeff
nao, nmo = mo_coeff.shape
Expand All @@ -100,7 +187,8 @@ def get_h2eff (las, mo_coeff=None):
raise MemoryError ("{} MB of {}/{} MB av/total for ERI array".format (
mem_eris, mem_remaining, las.max_memory))
if getattr (las, 'with_df', None) is not None:
eri = get_h2eff_df (las, mo_coeff)
if las.use_gpu: eri = get_h2eff_df (las,mo_coeff)#the full version is not working yet.
else: eri = get_h2eff_df (las, mo_coeff)
elif getattr (las._scf, '_eri', None) is not None:
eri = ao2mo.incore.general (las._scf._eri, mo, compact=True)
else:
Expand Down
7 changes: 4 additions & 3 deletions my_pyscf/mcscf/lasci.py
Original file line number Diff line number Diff line change
Expand Up @@ -1956,16 +1956,17 @@ def fast_veffa (self, casdm1s_sub, h2eff_sub, mo_coeff=None, ci=None, _full=Fals
ncas = sum (ncas_sub)
nocc = ncore + ncas
nao, nmo = mo_coeff.shape

gpu=self.use_gpu
mo_cas = mo_coeff[:,ncore:nocc]
moH_cas = mo_cas.conjugate ().T
moH_coeff = mo_coeff.conjugate ().T
dma = linalg.block_diag (*[dm[0] for dm in casdm1s_sub])
dmb = linalg.block_diag (*[dm[1] for dm in casdm1s_sub])
casdm1s = np.stack ([dma, dmb], axis=0)
if not (isinstance (self, _DFLASCI)):
if gpu or not (isinstance (self, _DFLASCI)):
dm1s = np.dot (mo_cas, np.dot (casdm1s, moH_cas)).transpose (1,0,2)
return self.get_veff (dm1s = dm1s, spin_sep=True)
if not _full: dm1s = dm1s[0]+dm1s[1]
return self.get_veff (dm1s = dm1s, spin_sep=_full)
casdm1 = casdm1s.sum (0)
dm1 = np.dot (mo_cas, np.dot (casdm1, moH_cas))
bPmn = sparsedf_array (self.with_df._cderi)
Expand Down
22 changes: 8 additions & 14 deletions my_pyscf/mcscf/lasci_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from mrh.my_pyscf.gpu import libgpu

# Setting DEBUG = True will execute both CPU (original) and GPU (new) paths checking for consistency
DEBUG = False
DEBUG = True

if DEBUG:
import math
Expand Down Expand Up @@ -1329,9 +1329,9 @@ def update_mo_ci_eri (self, x, h2eff_sub):
if(np.allclose(h2eff_sub,h2eff_sub2,atol=1e-13)): print('H2eff test passed')
else:print('H2eff gpu kernel is not working');print(np.max((h2eff_sub-h2eff_sub2)*(h2eff_sub-h2eff_sub2)));exit()
elif gpu:
h2eff_sub = self._update_h2eff_sub_gpu (gpu, mo1, umat, h2eff_sub,log, t0)
h2eff_sub = self._update_h2eff_sub_gpu (gpu, mo1, umat, h2eff_sub)
else:
h2eff_sub = self._update_h2eff_sub (mo1, umat, h2eff_sub,log,t0)
h2eff_sub = self._update_h2eff_sub (mo1, umat, h2eff_sub)
t0=log.timer('update_h2eff_sub',*t0)
return mo1, ci1, h2eff_sub

Expand All @@ -1357,20 +1357,17 @@ def _update_ci (self, dci):
ci1.append (ci1_r)
return ci1

def _update_h2eff_sub_gpu(self,gpu,mo1,umat,h2eff_sub,log,t0):
def _update_h2eff_sub_gpu(self,gpu,mo1,umat,h2eff_sub):
ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
ucas = umat[ncore:nocc, ncore:nocc]
bmPu = None
if hasattr (h2eff_sub, 'bmPu'): bmPu = h2eff_sub.bmPu
t0 = log.timer('setup',*t0)
libgpu.libgpu_update_h2eff_sub(gpu,self.ncore,self.ncas,self.nocc,self.nmo,umat, h2eff_sub)
t0 = log.timer('libgpu call to h2eff_update',*t0)
if bmPu is not None:
bmPu = np.dot (bmPu, ucas)
h2eff_sub = lib.tag_array (h2eff_sub, bmPu = bmPu)
t0 = log.timer('bmPu work',*t0)
return h2eff_sub

return h2eff_sub

def _update_h2eff_sub_debug(self, mo1, umat, h2eff_sub):
ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
ucas = umat[ncore:nocc, ncore:nocc]
Expand All @@ -1388,12 +1385,11 @@ def _update_h2eff_sub_debug(self, mo1, umat, h2eff_sub):
h2eff_sub = h2eff_sub[:,:,(ix_i*ncas)+ix_j]
h2eff_sub = h2eff_sub.reshape (nmo, -1)
return h2eff_sub
def _update_h2eff_sub (self, mo1, umat, h2eff_sub, log, t0):
def _update_h2eff_sub (self, mo1, umat, h2eff_sub):
ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
ucas = umat[ncore:nocc, ncore:nocc]
bmPu = None
if hasattr (h2eff_sub, 'bmPu'): print('here');bmPu = h2eff_sub.bmPu
t0 = log.timer('setup',*t0)
if hasattr (h2eff_sub, 'bmPu'):bmPu = h2eff_sub.bmPu
h2eff_sub = h2eff_sub.reshape (nmo*ncas, ncas*(ncas+1)//2)
h2eff_sub = lib.numpy_helper.unpack_tril (h2eff_sub)
h2eff_sub = h2eff_sub.reshape (nmo, ncas, ncas, ncas)
Expand All @@ -1405,11 +1401,9 @@ def _update_h2eff_sub (self, mo1, umat, h2eff_sub, log, t0):
h2eff_sub = h2eff_sub.reshape (nmo, ncas, ncas*ncas)
h2eff_sub = h2eff_sub[:,:,(ix_i*ncas)+ix_j]
h2eff_sub = h2eff_sub.reshape (nmo, -1)
t0 = log.timer('big math',*t0)
if bmPu is not None:
bmPu = np.dot (bmPu, ucas)
h2eff_sub = lib.tag_array (h2eff_sub, bmPu = bmPu)
t0 = log.timer('bmPu work',*t0)
return h2eff_sub

def get_grad (self):
Expand Down

0 comments on commit 62451ef

Please sign in to comment.