add option to bypass dfobj.loop() copy

cjknight committed Mar 30, 2024
1 parent 9bbb021 commit b0598dc
Showing 6 changed files with 165 additions and 26 deletions.
79 changes: 54 additions & 25 deletions gpu/gpu4pyscf/df/df_jk.py
@@ -80,6 +80,7 @@ def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
# dms = [numpy.asarray(x, order='F') for x in dms]
count = 0
for eri1 in dfobj.loop():
t6 = (logger.process_clock(), logger.perf_counter())
naux, nao_pair = eri1.shape
if gpu:
if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, 0, count)
@@ -88,11 +89,13 @@
rho = numpy.einsum('ix,px->ip', dmtril, eri1)
vj += numpy.einsum('ip,px->ix', rho, eri1)

lib.logger.timer(dfobj, 'get_jk not with_k loop iteration',*t6)

count += 1

if gpu:
libgpu.libgpu_pull_get_jk(gpu, vj, vk, 0)
t3 = lib.logger.timer(dfobj, 'get_jk not with_k',*t2)
t3 = lib.logger.timer(dfobj, 'get_jk not with_k loop full',*t2)

# Commented 2-19-2024 in favor of accelerated implementation below
# Can offload this if need arises.
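
As an aside, the per-iteration timers added in this commit follow PySCF's standard logger idiom: capture a (CPU, wall) timestamp pair, do the work, then log the elapsed time under a label. A minimal sketch, assuming an object such as dfobj that carries PySCF's usual verbose/stdout attributes:

    from pyscf import lib
    from pyscf.lib import logger

    t0 = (logger.process_clock(), logger.perf_counter())  # (CPU time, wall time)
    # ... work to be timed ...
    t0 = lib.logger.timer(dfobj, 'label for this span', *t0)  # logs elapsed time, returns a fresh timestamp
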
@@ -156,36 +159,62 @@
vj = numpy.zeros_like(dmtril)

t3 = lib.logger.timer(dfobj, 'get_jk with_k setup',*t2)
for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
t6 = (logger.process_clock(), logger.perf_counter())
naux, nao_pair = eri1.shape

if gpu:
if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
load_eri = True

else:
if gpu:
arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
if arg[2] > -1: load_eri = False

if load_eri:

for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
t6 = (logger.process_clock(), logger.perf_counter())
naux, nao_pair = eri1.shape

if gpu:
if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))

if with_j:
rho = numpy.einsum('ix,px->ip', dmtril, eri1)
vj += numpy.einsum('ip,px->ix', rho, eri1)
else:

for k in range(nset):
buf1 = buf[0,:naux]
fdrv(ftrans, fmmm,
buf1.ctypes.data_as(ctypes.c_void_p),
eri1.ctypes.data_as(ctypes.c_void_p),
dms[k].ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(naux), *rargs)
if with_j:
rho = numpy.einsum('ix,px->ip', dmtril, eri1)
vj += numpy.einsum('ip,px->ix', rho, eri1)

buf2 = lib.unpack_tril(eri1, out=buf[1])
vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))

count+=1
lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)

for k in range(nset):
buf1 = buf[0,:naux]
fdrv(ftrans, fmmm,
buf1.ctypes.data_as(ctypes.c_void_p),
eri1.ctypes.data_as(ctypes.c_void_p),
dms[k].ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(naux), *rargs)

buf2 = lib.unpack_tril(eri1, out=buf[1])
vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))

count+=1
lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)

else:

nblk = arg[2]
for count in range(nblk):
t6 = (logger.process_clock(), logger.perf_counter())
arg = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
naux = arg[0]
nao_pair = arg[1]

eri1 = numpy.zeros(1) # dummy array; the device reads its cached eri block
if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))

lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)

t4 = lib.logger.timer(dfobj, 'get_jk with_k loop full',*t3)
t1 = log.timer_debug1('jk', *t1)
t4 = lib.logger.timer(dfobj, 'get_jk with_k loop',*t3)

if gpu:
libgpu.libgpu_pull_get_jk(gpu, vj, vk, 1)
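Taken together, the df_jk.py changes implement the bypass named in the commit message. A condensed sketch of the new with_k control flow, assuming this repo's libgpu bindings and an initialized gpu handle (the CPU and with_j branches are elided; not runnable standalone):

    load_eri = True
    if gpu:
        # ask the device how many eri blocks it has cached for this dfobj
        arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
        libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
        if arg[2] > -1: load_eri = False  # blocks already cached: skip the host-side copy

    if load_eri:
        # original path: dfobj.loop(blksize) copies each eri block on the host
        for count, eri1 in enumerate(dfobj.loop(blksize)):
            naux, nao_pair = eri1.shape
            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
    else:
        # bypass path: re-query each cached block's {naux, nao_pair} and pass a dummy eri1
        for count in range(arg[2]):
            arg = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
            libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
            naux = arg[0]
            eri1 = numpy.zeros(1)  # placeholder; the device reads its cached block
            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))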
83 changes: 83 additions & 0 deletions gpu/src/device.cpp
@@ -212,6 +212,89 @@ void Device::set_update_dfobj_(int _val)
update_dfobj = _val; // this is reset to zero in Device::pull_get_jk
}

/* ---------------------------------------------------------------------- */

// return stored values so the Python side can make decisions
// update_dfobj == true :: nothing useful to return; eri blocks on device need updating first
// count_ == -1 :: return # of blocks cached for dfobj
// count_ >= 0 :: return {naux, nao_pair} for cached block count_

void Device::get_dfobj_status(size_t addr_dfobj, py::array_t<int> _arg)
{
py::buffer_info info_arg = _arg.request();
int * arg = static_cast<int*>(info_arg.ptr);

int naux_ = arg[0];
int nao_pair_ = arg[1];
int count_ = arg[2];
int update_dfobj_ = arg[3];

// printf("Inside get_dfobj_status(): addr_dfobj= %#012x naux_= %i nao_pair_= %i count_= %i update_dfobj_= %i\n",
// addr_dfobj, naux_, nao_pair_, count_, update_dfobj_);

update_dfobj_ = update_dfobj;

// nothing useful to return if need to update eri blocks on device

if(update_dfobj) {
// printf("Leaving get_dfobj_status(): addr_dfobj= %#012x update_dfobj_= %i\n", addr_dfobj, update_dfobj_);

arg[3] = update_dfobj_;
return;
}

// return # of blocks cached for dfobj

if(count_ == -1) {
int id = eri_list.size();
for(int i=0; i<eri_list.size(); ++i)
if(eri_list[i] == addr_dfobj) {
id = i;
break;
}

if(id < eri_list.size()) count_ = eri_num_blocks[id];

// printf("Leaving get_dfobj_status(): addr_dfobj= %#012x count_= %i update_dfobj_= %i\n", addr_dfobj, count_, update_dfobj_);

arg[2] = count_;
arg[3] = update_dfobj_;
return;
}

// return extra data for cached block

int id = eri_list.size();
for(int i=0; i<eri_list.size(); ++i)
if(eri_list[i] == addr_dfobj+count_) {
id = i;
break;
}

// printf("eri_list.size()= %i id= %i\n",eri_list.size(), id);

naux_ = -1;
nao_pair_ = -1;

if(id < eri_list.size()) {

naux_ = eri_extra[id * _ERI_CACHE_EXTRA ];
nao_pair_ = eri_extra[id * _ERI_CACHE_EXTRA + 1];

}

arg[0] = naux_;
arg[1] = nao_pair_;
arg[2] = count_;
arg[3] = update_dfobj_;

// printf("Leaving get_dfobj_status(): addr_dfobj= %#012x id= %i naux_= %i nao_pair_= %i count_= %i update_dfobj_= %i\n",
// addr_dfobj, id, naux_, nao_pair_, count_, update_dfobj_);

// printf("Leaving get_dfobj_status(): addr_dfobj= %#012x id= %i arg= %i %i %i %i\n",
// addr_dfobj, id, arg[0], arg[1], arg[2], arg[3]);
}
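
The int[4] arg array is used for both input and output, laid out as {naux, nao_pair, count, update_dfobj}. A sketch of the two query modes from the Python side, mirroring the calls in df_jk.py above:

    # mode 1: count = -1 -> how many eri blocks are cached for this dfobj?
    arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
    libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
    nblk = arg[2]   # stays -1 if nothing is cached
    stale = arg[3]  # nonzero: device eri blocks need updating, other fields not useful

    # mode 2: count >= 0 -> {naux, nao_pair} of one cached block
    arg = numpy.array([-1, -1, 0, -1], dtype=numpy.int32)
    libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
    naux, nao_pair = arg[0], arg[1]  # -1, -1 if block 0 is not cached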

/* ---------------------------------------------------------------------- */

// Is _ocm2 both in/out, as it gets over-written and resized?
6 changes: 6 additions & 0 deletions gpu/src/device.h
@@ -20,6 +20,7 @@ using namespace PM_NS
#define _SIZE_BLOCK 256

#define _USE_ERI_CACHE
#define _ERI_CACHE_EXTRA 2
//#define _DEBUG_ERI_CACHE

#define OUTPUTIJ 1
@@ -66,7 +67,9 @@ public :
py::array_t<double>, py::array_t<double>,
int, int, size_t);
void pull_get_jk(py::array_t<double>, py::array_t<double>, int);

void set_update_dfobj_(int);
void get_dfobj_status(size_t, py::array_t<int>);

void hessop_get_veff(int, int, int, int,
py::array_t<double>, py::array_t<double>, py::array_t<double>);
@@ -150,6 +153,9 @@ public :
std::vector<int> eri_update; // # times particular cache updated
std::vector<int> eri_size; // # size of particular cache

std::vector<int> eri_num_blocks; // # of eri blocks for each dfobj (i.e. trip-count from `for eri1 in dfobj.loop(blksize)`)
std::vector<int> eri_extra; // per-block data: {naux, nao_pair}

std::vector<double *> d_eri_cache; // pointers for device caches
std::vector<double *> d_eri_host; // values on host for checking if update

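For reference, a short sketch of how the two new members are meant to be read (stride _ERI_CACHE_EXTRA = 2, matching Device::get_dfobj_status() above; id is a block's index in eri_list):

    // per-block shape data lives at consecutive slots of the flat eri_extra vector
    int naux     = eri_extra[id * _ERI_CACHE_EXTRA    ];
    int nao_pair = eri_extra[id * _ERI_CACHE_EXTRA + 1];
    // eri_num_blocks[i] is meaningful at i = slot of a dfobj's first block (key addr_dfobj + 0)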
11 changes: 10 additions & 1 deletion gpu/src/device_cuda.cpp
@@ -477,14 +477,17 @@ void Device::get_jk(int naux,

#ifdef _USE_ERI_CACHE

// retrieve or cache eri block
// retrieve id of cached eri block

int id = eri_list.size();
for(int i=0; i<eri_list.size(); ++i)
if(eri_list[i] == addr_dfobj+count) {
id = i;
break;
}

// grab/update cached data

if(id < eri_list.size()) {
eri_count[id]++;
d_eri = d_eri_cache[id];
@@ -534,6 +537,12 @@
eri_count.push_back(1);
eri_update.push_back(0);
eri_size.push_back(naux * nao_pair);

eri_num_blocks.push_back(0); // grow array: one slot per cached block
eri_num_blocks[id-count]++; // increment # of blocks for this dfobj; blocks are cached contiguously and in order, so id-count is the slot of this dfobj's block 0

eri_extra.push_back(naux);
eri_extra.push_back(nao_pair);

d_eri_cache.push_back( (double *) pm->dev_malloc(naux * nao_pair * sizeof(double)) );
int id = d_eri_cache.size() - 1;
8 changes: 8 additions & 0 deletions gpu/src/libgpu.cpp
@@ -113,6 +113,14 @@ void libgpu_set_update_dfobj_(void * ptr, int update_dfobj)

/* ---------------------------------------------------------------------- */

void libgpu_get_dfobj_status(void * ptr, size_t addr_dfobj, py::array_t<int> arg)
{
Device * dev = (Device *) ptr;
dev->get_dfobj_status(addr_dfobj, arg);
}

/* ---------------------------------------------------------------------- */

void libgpu_hessop_get_veff(void * ptr,
int naux, int nmo, int ncore, int nocc,
py::array_t<double> bPpj, py::array_t<double> vPpj, py::array_t<double> vk_bj)
4 changes: 4 additions & 0 deletions gpu/src/libgpu.h
@@ -30,7 +30,9 @@ extern "C"
int, int, size_t);

void libgpu_pull_get_jk(void *, py::array_t<double>, py::array_t<double>, int);

void libgpu_set_update_dfobj_(void *, int);
void libgpu_get_dfobj_status(void *, size_t, py::array_t<int>);

void libgpu_hessop_get_veff(void *,
int, int, int, int,
@@ -61,7 +63,9 @@ PYBIND11_MODULE(libgpu, m) {
m.def("libgpu_compute_get_jk", &libgpu_compute_get_jk, "pyscf/df/df_jk.py::get_jk()");
m.def("libgpu_init_get_jk", &libgpu_init_get_jk, "alloc for get_jk()");
m.def("libgpu_pull_get_jk", &libgpu_pull_get_jk, "retrieve vj & vk from get_jk()");

m.def("libgpu_set_update_dfobj_", &libgpu_set_update_dfobj_, "ensure that eri is updated on device for get_jk");
m.def("libgpu_get_dfobj_status", &libgpu_get_dfobj_status, "retrieve info on dfobj and cached eri blocks on device");

m.def("libgpu_hessop_get_veff", &libgpu_hessop_get_veff, "lasci_sync.py::get_veff() for HessianOperator");
m.def("libgpu_hessop_push_bPpj", &libgpu_hessop_push_bPpj, "bPpj array for HessianOperator");
