From b0598dc22dcded5a70fd2c650141565583ddc609 Mon Sep 17 00:00:00 2001
From: cjknight
Date: Fri, 29 Mar 2024 21:39:31 -0500
Subject: [PATCH] add option to bypass dfobj.loop() copy

---
 gpu/gpu4pyscf/df/df_jk.py | 79 +++++++++++++++++++++++++------------
 gpu/src/device.cpp        | 83 +++++++++++++++++++++++++++++++++++++++
 gpu/src/device.h          |  6 +++
 gpu/src/device_cuda.cpp   | 11 +++++-
 gpu/src/libgpu.cpp        |  8 ++++
 gpu/src/libgpu.h          |  4 ++
 6 files changed, 165 insertions(+), 26 deletions(-)

diff --git a/gpu/gpu4pyscf/df/df_jk.py b/gpu/gpu4pyscf/df/df_jk.py
index 4617694a..4c3a2669 100644
--- a/gpu/gpu4pyscf/df/df_jk.py
+++ b/gpu/gpu4pyscf/df/df_jk.py
@@ -80,6 +80,7 @@ def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
 #            dms = [numpy.asarray(x, order='F') for x in dms]
         count = 0
         for eri1 in dfobj.loop():
+            t6 = (logger.process_clock(), logger.perf_counter())
             naux, nao_pair = eri1.shape
             if gpu:
                 if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, 0, count)
@@ -88,11 +89,13 @@
                 rho = numpy.einsum('ix,px->ip', dmtril, eri1)
                 vj += numpy.einsum('ip,px->ix', rho, eri1)
 
+            lib.logger.timer(dfobj, 'get_jk not with_k loop iteration',*t6)
+
             count += 1
 
         if gpu: libgpu.libgpu_pull_get_jk(gpu, vj, vk, 0)
 
-        t3 = lib.logger.timer(dfobj, 'get_jk not with_k',*t2)
+        t3 = lib.logger.timer(dfobj, 'get_jk not with_k loop full',*t2)
 
 # Commented 2-19-2024 in favor of accelerated implementation below
 # Can offload this if need arises.
@@ -156,36 +159,62 @@
         vj = numpy.zeros_like(dmtril)
 
     t3 = lib.logger.timer(dfobj, 'get_jk with_k setup',*t2)
 
-    for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
-        t6 = (logger.process_clock(), logger.perf_counter())
-        naux, nao_pair = eri1.shape
-
-        if gpu:
-            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
-            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
+    load_eri = True
 
-        else:
+    if gpu:
+        arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
+        libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
+        if arg[2] > -1: load_eri = False
+
+    if load_eri:
+
+        for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
+            t6 = (logger.process_clock(), logger.perf_counter())
+            naux, nao_pair = eri1.shape
+
+            if gpu:
+                if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
+                libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
 
-            if with_j:
-                rho = numpy.einsum('ix,px->ip', dmtril, eri1)
-                vj += numpy.einsum('ip,px->ix', rho, eri1)
+            else:
 
-            for k in range(nset):
-                buf1 = buf[0,:naux]
-                fdrv(ftrans, fmmm,
-                     buf1.ctypes.data_as(ctypes.c_void_p),
-                     eri1.ctypes.data_as(ctypes.c_void_p),
-                     dms[k].ctypes.data_as(ctypes.c_void_p),
-                     ctypes.c_int(naux), *rargs)
+                if with_j:
+                    rho = numpy.einsum('ix,px->ip', dmtril, eri1)
+                    vj += numpy.einsum('ip,px->ix', rho, eri1)
 
-                buf2 = lib.unpack_tril(eri1, out=buf[1])
-                vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))
-
-        count+=1
-        lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
-
+                for k in range(nset):
+                    buf1 = buf[0,:naux]
+                    fdrv(ftrans, fmmm,
+                         buf1.ctypes.data_as(ctypes.c_void_p),
+                         eri1.ctypes.data_as(ctypes.c_void_p),
+                         dms[k].ctypes.data_as(ctypes.c_void_p),
+                         ctypes.c_int(naux), *rargs)
+
+                    buf2 = lib.unpack_tril(eri1, out=buf[1])
+                    vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))
+
+            count+=1
+            lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
+
+    else:
+
+        nblk = arg[2]
+        for count in range( arg[2] ):
+            t6 = (logger.process_clock(), logger.perf_counter())
+
+            arg = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
+            libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
+
+            naux = arg[0]
+            nao_pair = arg[1]
+
+            eri1 = numpy.zeros(1)
+            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
+
+            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
+
+            lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
+
+    t4 = lib.logger.timer(dfobj, 'get_jk with_k loop full',*t3)
 
     t1 = log.timer_debug1('jk', *t1)
-    t4 = lib.logger.timer(dfobj, 'get_jk with_k loop',*t3)
 
     if gpu: libgpu.libgpu_pull_get_jk(gpu, vj, vk, 1)
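The df_jk.py hunk above is the heart of the patch: before paying for the host-side copies in dfobj.loop(blksize), the driver now asks the device whether ERI blocks for this dfobj are already cached (arg[2] > -1) and, if so, walks block indices instead of block data. Below is a minimal, self-contained sketch of that handshake. FakeLibgpu and its block table are invented stand-ins for the compiled libgpu extension; only the four-element int32 arg convention ([naux, nao_pair, count, update_dfobj], with -1 meaning unset) is taken from the patch.

    import numpy

    class FakeLibgpu:
        # Invented stand-in for the compiled libgpu module (illustration only).
        def __init__(self, blocks):
            self.blocks = blocks  # list of (naux, nao_pair) "cached" on device

        def libgpu_get_dfobj_status(self, addr_dfobj, arg):
            if arg[2] == -1:
                # count == -1 :: report # of blocks cached for this dfobj
                arg[2] = len(self.blocks) if self.blocks else -1
            else:
                # count >= 0 :: report extra data for cached block `count`
                arg[0], arg[1] = self.blocks[arg[2]]

    gpu = FakeLibgpu(blocks=[(128, 5050), (128, 5050), (64, 5050)])
    addr_dfobj = 0x1234  # stands in for id(dfobj)

    # First query: can the host-side eri1 copy loop be bypassed?
    arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
    gpu.libgpu_get_dfobj_status(addr_dfobj, arg)
    load_eri = True
    if arg[2] > -1: load_eri = False  # same guard as the patch

    if not load_eri:
        # Bypass path: iterate block indices, fetching shapes, not data.
        for count in range(arg[2]):
            blk = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
            gpu.libgpu_get_dfobj_status(addr_dfobj, blk)
            print('block %d: naux=%d nao_pair=%d' % (count, blk[0], blk[1]))

Note that the bypass path still calls libgpu_init_get_jk and libgpu_compute_get_jk with a dummy one-element eri1 (numpy.zeros(1)), since the device already holds the real blocks and only the shapes are needed on the host.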
diff --git a/gpu/src/device.cpp b/gpu/src/device.cpp
index 401551b5..afbcb8e5 100644
--- a/gpu/src/device.cpp
+++ b/gpu/src/device.cpp
@@ -212,6 +212,89 @@ void Device::set_update_dfobj_(int _val)
   update_dfobj = _val; // this is reset to zero in Device::pull_get_jk
 }
 
+/* ---------------------------------------------------------------------- */
+
+// return stored values for Python side to make decisions
+// update_dfobj == true :: nothing useful to return if need to update eri blocks on device
+// count_ == -1 :: return # of blocks cached for dfobj
+// count_ >= 0 :: return extra data for cached block
+
+void Device::get_dfobj_status(size_t addr_dfobj, py::array_t<int> _arg)
+{
+  py::buffer_info info_arg = _arg.request();
+  int * arg = static_cast<int*>(info_arg.ptr);
+
+  int naux_ = arg[0];
+  int nao_pair_ = arg[1];
+  int count_ = arg[2];
+  int update_dfobj_ = arg[3];
+
+  //  printf("Inside get_dfobj_status(): addr_dfobj= %#012x naux_= %i nao_pair_= %i count_= %i update_dfobj_= %i\n",
+  //         addr_dfobj, naux_, nao_pair_, count_, update_dfobj_);
+
+  update_dfobj_ = update_dfobj;
+
+  // nothing useful to return if need to update eri blocks on device
+
+  if(update_dfobj) {
+    //    printf("Leaving get_dfobj_status(): addr_dfobj= %#012x update_dfobj_= %i\n", addr_dfobj, update_dfobj_);
+
+    arg[3] = update_dfobj_;
+    return;
+  }
+
+  // return # of blocks cached for dfobj
+
+  if(count_ == -1) {
+    int id = eri_list.size();
+    for(int i=0; i<eri_list.size(); ++i)

[... the remainder of Device::get_dfobj_status() and the header of the gpu/src/device.h diff were lost in extraction ...]

diff --git a/gpu/src/device.h b/gpu/src/device.h
@@ ... @@
 	      py::array_t<double>, int, int, size_t);
   void pull_get_jk(py::array_t<double>, py::array_t<double>, int);
+
   void set_update_dfobj_(int);
+  void get_dfobj_status(size_t, py::array_t<int>);
 
   void hessop_get_veff(int, int, int, int,
 		       py::array_t<double>, py::array_t<double>, py::array_t<double>);
 
@@ -150,6 +153,9 @@ public :
   std::vector<int> eri_update; // # times particular cache updated
   std::vector<size_t> eri_size; // # size of particular cache
 
+  std::vector<int> eri_num_blocks; // # of eri blocks for each dfobj (i.e. trip-count from `for eri1 in dfobj.loop(blksize)`)
+  std::vector<int> eri_extra; // per-block data: {naux, nao_pair}
+
   std::vector<double *> d_eri_cache; // pointers for device caches
   std::vector<double *> d_eri_host; // values on host for checking if update
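The two new vectors declared above grow in lockstep with the existing cache arrays, one entry (two entries, for eri_extra) per cached block. The Python model below is a reading aid only: the cache_block() helper and its (addr_dfobj, count) key are assumptions for illustration, since the patch shows the declarations here and the push_back site in device_cuda.cpp below, but not the full update logic.

    class EriCacheModel:
        # Rough Python analogue of the Device members; names mirror the C++ fields.
        def __init__(self):
            self.eri_list = []        # one key per cached eri block
            self.eri_update = []      # # times particular cache updated
            self.eri_size = []        # size of particular cache
            self.eri_num_blocks = []  # # of eri blocks per dfobj (loop trip-count)
            self.eri_extra = []       # flattened per-block data: {naux, nao_pair}
            self.d_eri_cache = []     # device pointers (here: placeholder buffers)

        def cache_block(self, addr_dfobj, count, naux, nao_pair):
            # Hypothetical composite key; the real keying is not shown in the patch.
            self.eri_list.append((addr_dfobj, count))
            self.eri_update.append(0)
            self.eri_size.append(naux * nao_pair)
            if count == 0:                 # first block of a new dfobj
                self.eri_num_blocks.append(0)
            self.eri_num_blocks[-1] += 1
            self.eri_extra += [naux, nao_pair]  # two ints per block
            self.d_eri_cache.append(bytearray(8 * naux * nao_pair))

    cache = EriCacheModel()
    for count, (naux, nao_pair) in enumerate([(128, 5050), (64, 5050)]):
        cache.cache_block(0x1234, count, naux, nao_pair)
    print(cache.eri_num_blocks, cache.eri_extra)  # [2] [128, 5050, 64, 5050]

With this layout, answering the two queries from get_dfobj_status is a list lookup: the count == -1 query reads eri_num_blocks, and the count >= 0 query reads the pair at eri_extra[2*id : 2*id+2].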
diff --git a/gpu/src/device_cuda.cpp b/gpu/src/device_cuda.cpp
index 0c053bab..be44341a 100644
--- a/gpu/src/device_cuda.cpp
+++ b/gpu/src/device_cuda.cpp
@@ -477,7 +477,8 @@ void Device::get_jk(int naux,
 
 #ifdef _USE_ERI_CACHE
 
-  // retrieve or cache eri block
+  // retrieve id of cached eri block
+
   int id = eri_list.size();
   for(int i=0; i<eri_list.size(); ++i)

[... intermediate lines of the device_cuda.cpp diff were lost in extraction ...]

       d_eri_cache.push_back( (double *) pm->dev_malloc(naux * nao_pair * sizeof(double)) );
       int id = d_eri_cache.size() - 1;

[... remaining lines of the device_cuda.cpp diff were lost in extraction ...]

diff --git a/gpu/src/libgpu.cpp b/gpu/src/libgpu.cpp
index 0ca25e2e..1e95e251 100644
--- a/gpu/src/libgpu.cpp
+++ b/gpu/src/libgpu.cpp
@@ -113,6 +113,14 @@ void libgpu_set_update_dfobj_(void * ptr, int update_dfobj)
 
 /* ---------------------------------------------------------------------- */
 
+void libgpu_get_dfobj_status(void * ptr, size_t addr_dfobj, py::array_t<int> arg)
+{
+  Device * dev = (Device *) ptr;
+  dev->get_dfobj_status(addr_dfobj, arg);
+}
+
+/* ---------------------------------------------------------------------- */
+
 void libgpu_hessop_get_veff(void * ptr,
 			    int naux, int nmo, int ncore, int nocc,
 			    py::array_t<double> bPpj, py::array_t<double> vPpj, py::array_t<double> vk_bj)
diff --git a/gpu/src/libgpu.h b/gpu/src/libgpu.h
index 637dbfae..ce3f2a54 100644
--- a/gpu/src/libgpu.h
+++ b/gpu/src/libgpu.h
@@ -30,7 +30,9 @@ extern "C"
 		     int, int, size_t);
   void libgpu_pull_get_jk(void *, py::array_t<double>, py::array_t<double>, int);
+
   void libgpu_set_update_dfobj_(void *, int);
+  void libgpu_get_dfobj_status(void *, size_t, py::array_t<int>);
 
   void libgpu_hessop_get_veff(void *,
 			      int, int, int, int,
@@ -61,7 +63,9 @@ PYBIND11_MODULE(libgpu, m) {
   m.def("libgpu_compute_get_jk", &libgpu_compute_get_jk, "pyscf/df/df_jk.py::get_jk()");
   m.def("libgpu_init_get_jk", &libgpu_init_get_jk, "alloc for get_jk()");
   m.def("libgpu_pull_get_jk", &libgpu_pull_get_jk, "retrieve vj & vk from get_jk()");
+
   m.def("libgpu_set_update_dfobj_", &libgpu_set_update_dfobj_, "ensure that eri is updated on device for get_jk");
+  m.def("libgpu_get_dfobj_status", &libgpu_get_dfobj_status, "retrieve info on dfobj and cached eri blocks on device");
 
   m.def("libgpu_hessop_get_veff", &libgpu_hessop_get_veff, "lasci_sync.py::get_veff() for HessianOperator");
   m.def("libgpu_hessop_push_bPpj", &libgpu_hessop_push_bPpj, "bPpj array for HessianOperator");
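A final note on staleness: per the comments in Device::set_update_dfobj_() and Device::get_dfobj_status() above, a pending update_dfobj means the device reports nothing useful, so get_jk() falls back to the copying loop, and the flag is cleared again in Device::pull_get_jk. The sketch below models just that gating; MockDevice is an invented stand-in, not the real extension.

    import numpy

    class MockDevice:
        # Invented stand-in; only the arg[] convention and the early-return
        # behavior of Device::get_dfobj_status() are modeled here.
        def __init__(self, num_blocks):
            self.update_dfobj = 0
            self.num_blocks = num_blocks

        def set_update_dfobj_(self, val):
            self.update_dfobj = val  # real code resets this in pull_get_jk

        def get_dfobj_status(self, addr_dfobj, arg):
            arg[3] = self.update_dfobj
            if self.update_dfobj:
                return               # nothing useful while an update is pending
            if arg[2] == -1:
                arg[2] = self.num_blocks

    def can_bypass(dev, addr_dfobj):
        arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
        dev.get_dfobj_status(addr_dfobj, arg)
        return arg[2] > -1

    dev = MockDevice(num_blocks=4)
    print(can_bypass(dev, 0x1234))   # True: cached blocks, copy loop skipped
    dev.set_update_dfobj_(1)         # dfobj changed: device must re-copy ERIs
    print(can_bypass(dev, 0x1234))   # False: fall back to dfobj.loop(blksize)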