From b0598dc22dcded5a70fd2c650141565583ddc609 Mon Sep 17 00:00:00 2001
From: cjknight
Date: Fri, 29 Mar 2024 21:39:31 -0500
Subject: [PATCH] add option to bypass dfobj.loop() copy

---
 gpu/gpu4pyscf/df/df_jk.py | 79 +++++++++++++++++++++++++------------
 gpu/src/device.cpp        | 83 +++++++++++++++++++++++++++++++++++++++
 gpu/src/device.h          |  6 +++
 gpu/src/device_cuda.cpp   | 11 +++++-
 gpu/src/libgpu.cpp        |  8 ++++
 gpu/src/libgpu.h          |  4 ++
 6 files changed, 165 insertions(+), 26 deletions(-)

diff --git a/gpu/gpu4pyscf/df/df_jk.py b/gpu/gpu4pyscf/df/df_jk.py
index 4617694a..4c3a2669 100644
--- a/gpu/gpu4pyscf/df/df_jk.py
+++ b/gpu/gpu4pyscf/df/df_jk.py
@@ -80,6 +80,7 @@ def get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-13):
 #            dms = [numpy.asarray(x, order='F') for x in dms]
         count = 0
         for eri1 in dfobj.loop():
+            t6 = (logger.process_clock(), logger.perf_counter())
             naux, nao_pair = eri1.shape
             if gpu:
                 if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, 0, count)
@@ -88,11 +89,13 @@
                 rho = numpy.einsum('ix,px->ip', dmtril, eri1)
                 vj += numpy.einsum('ip,px->ix', rho, eri1)
 
+            lib.logger.timer(dfobj, 'get_jk not with_k loop iteration',*t6)
+
             count += 1
 
         if gpu: libgpu.libgpu_pull_get_jk(gpu, vj, vk, 0)
 
-        t3 = lib.logger.timer(dfobj, 'get_jk not with_k',*t2)
+        t3 = lib.logger.timer(dfobj, 'get_jk not with_k loop full',*t2)
 
 # Commented 2-19-2024 in favor of accelerated implementation below
 # Can offload this if need arises.
@@ -156,36 +159,62 @@
         vj = numpy.zeros_like(dmtril)
 
     t3 = lib.logger.timer(dfobj, 'get_jk with_k setup',*t2)
 
-    for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
-        t6 = (logger.process_clock(), logger.perf_counter())
-        naux, nao_pair = eri1.shape
-
-        if gpu:
-            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
-            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
+    load_eri = True
 
-        else:
+    if gpu:
+        arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
+        libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
+        if arg[2] > -1: load_eri = False
+
+    if load_eri:
+
+        for eri1 in dfobj.loop(blksize): # how much time spent unnecessarily copying eri1 data?
+            t6 = (logger.process_clock(), logger.perf_counter())
+            naux, nao_pair = eri1.shape
+
+            if gpu:
+                if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
+                libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
 
-            if with_j:
-                rho = numpy.einsum('ix,px->ip', dmtril, eri1)
-                vj += numpy.einsum('ip,px->ix', rho, eri1)
+            else:
 
-            for k in range(nset):
-                buf1 = buf[0,:naux]
-                fdrv(ftrans, fmmm,
-                     buf1.ctypes.data_as(ctypes.c_void_p),
-                     eri1.ctypes.data_as(ctypes.c_void_p),
-                     dms[k].ctypes.data_as(ctypes.c_void_p),
-                     ctypes.c_int(naux), *rargs)
+                if with_j:
+                    rho = numpy.einsum('ix,px->ip', dmtril, eri1)
+                    vj += numpy.einsum('ip,px->ix', rho, eri1)
 
-                buf2 = lib.unpack_tril(eri1, out=buf[1])
-                vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))
-
-        count+=1
-        lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
-
+                for k in range(nset):
+                    buf1 = buf[0,:naux]
+                    fdrv(ftrans, fmmm,
+                         buf1.ctypes.data_as(ctypes.c_void_p),
+                         eri1.ctypes.data_as(ctypes.c_void_p),
+                         dms[k].ctypes.data_as(ctypes.c_void_p),
+                         ctypes.c_int(naux), *rargs)
+
+                    buf2 = lib.unpack_tril(eri1, out=buf[1])
+                    vk[k] += lib.dot(buf1.reshape(-1,nao).T, buf2.reshape(-1,nao))
+
+            count+=1
+            lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
+
+    else:
+
+        nblk = arg[2]
+        for count in range( arg[2] ):
+            t6 = (logger.process_clock(), logger.perf_counter())
+
+            arg = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
+            libgpu.libgpu_get_dfobj_status(gpu, id(dfobj), arg)
+
+            naux = arg[0]
+            nao_pair = arg[1]
+
+            eri1 = numpy.zeros(1)
+            if count == 0: libgpu.libgpu_init_get_jk(gpu, eri1, dmtril, blksize, nset, nao, naux, count)
+
+            libgpu.libgpu_compute_get_jk(gpu, naux, eri1, dmtril, dms, vj, vk, 1, count, id(dfobj))
+
+            lib.logger.timer(dfobj, 'get_jk with_k loop iteration',*t6)
+
+    t4 = lib.logger.timer(dfobj, 'get_jk with_k loop full',*t3)
 
     t1 = log.timer_debug1('jk', *t1)
-    t4 = lib.logger.timer(dfobj, 'get_jk with_k loop',*t3)
 
     if gpu: libgpu.libgpu_pull_get_jk(gpu, vj, vk, 1)
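The df_jk.py hunk above is the heart of the patch: before paying for the host-side copies in dfobj.loop(blksize), the driver now asks the device whether ERI blocks for this dfobj are already cached (arg[2] > -1) and, if so, walks block indices instead of block data. Below is a minimal, self-contained sketch of that handshake. FakeLibgpu and its block table are invented stand-ins for the compiled libgpu extension; only the four-element int32 arg convention ([naux, nao_pair, count, update_dfobj], with -1 meaning unset) is taken from the patch.

    import numpy

    class FakeLibgpu:
        # Invented stand-in for the compiled libgpu module (illustration only).
        def __init__(self, blocks):
            self.blocks = blocks  # list of (naux, nao_pair) "cached" on device

        def libgpu_get_dfobj_status(self, addr_dfobj, arg):
            if arg[2] == -1:
                # count == -1 :: report # of blocks cached for this dfobj
                arg[2] = len(self.blocks) if self.blocks else -1
            else:
                # count >= 0 :: report extra data for cached block `count`
                arg[0], arg[1] = self.blocks[arg[2]]

    gpu = FakeLibgpu(blocks=[(128, 5050), (128, 5050), (64, 5050)])
    addr_dfobj = 0x1234  # stands in for id(dfobj)

    # First query: can the host-side eri1 copy loop be bypassed?
    arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
    gpu.libgpu_get_dfobj_status(addr_dfobj, arg)
    load_eri = True
    if arg[2] > -1: load_eri = False  # same guard as the patch

    if not load_eri:
        # Bypass path: iterate block indices, fetching shapes, not data.
        for count in range(arg[2]):
            blk = numpy.array([-1, -1, count, -1], dtype=numpy.int32)
            gpu.libgpu_get_dfobj_status(addr_dfobj, blk)
            print('block %d: naux=%d nao_pair=%d' % (count, blk[0], blk[1]))

Note that the bypass path still calls libgpu_init_get_jk and libgpu_compute_get_jk with a dummy one-element eri1 (numpy.zeros(1)), since the device already holds the real blocks and only the shapes are needed on the host.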
diff --git a/gpu/src/device.cpp b/gpu/src/device.cpp
index 401551b5..afbcb8e5 100644
--- a/gpu/src/device.cpp
+++ b/gpu/src/device.cpp
@@ -212,6 +212,89 @@ void Device::set_update_dfobj_(int _val)
   update_dfobj = _val; // this is reset to zero in Device::pull_get_jk
 }
 
+/* ---------------------------------------------------------------------- */
+
+// return stored values for Python side to make decisions
+// update_dfobj == true :: nothing useful to return if need to update eri blocks on device
+// count_ == -1 :: return # of blocks cached for dfobj
+// count_ >= 0 :: return extra data for cached block
+
+void Device::get_dfobj_status(size_t addr_dfobj, py::array_t<int> _arg)
+{
+  py::buffer_info info_arg = _arg.request();
+  int * arg = static_cast<int*>(info_arg.ptr);
+
+  int naux_ = arg[0];
+  int nao_pair_ = arg[1];
+  int count_ = arg[2];
+  int update_dfobj_ = arg[3];
+
+  //  printf("Inside get_dfobj_status(): addr_dfobj= %#012x naux_= %i nao_pair_= %i count_= %i update_dfobj_= %i\n",
+  //         addr_dfobj, naux_, nao_pair_, count_, update_dfobj_);
+
+  update_dfobj_ = update_dfobj;
+
+  // nothing useful to return if need to update eri blocks on device
+
+  if(update_dfobj) {
+    //    printf("Leaving get_dfobj_status(): addr_dfobj= %#012x update_dfobj_= %i\n", addr_dfobj, update_dfobj_);
+
+    arg[3] = update_dfobj_;
+    return;
+  }
+
+  // return # of blocks cached for dfobj
+
+  if(count_ == -1) {
+    int id = eri_list.size();
+    for(int i=0; i<eri_list.size(); ++i)

[... the remainder of Device::get_dfobj_status() and the header of the gpu/src/device.h diff were lost in extraction ...]

diff --git a/gpu/src/device.h b/gpu/src/device.h
@@ ... @@
 	      py::array_t<double>, int, int, size_t);
   void pull_get_jk(py::array_t<double>, py::array_t<double>, int);
+
   void set_update_dfobj_(int);
+  void get_dfobj_status(size_t, py::array_t<int>);
 
   void hessop_get_veff(int, int, int, int,
 		       py::array_t<double>, py::array_t<double>, py::array_t<double>);
 
@@ -150,6 +153,9 @@ public :
   std::vector<int> eri_update; // # times particular cache updated
   std::vector<size_t> eri_size; // # size of particular cache
 
+  std::vector<int> eri_num_blocks; // # of eri blocks for each dfobj (i.e. trip-count from `for eri1 in dfobj.loop(blksize)`)
+  std::vector<int> eri_extra; // per-block data: {naux, nao_pair}
+
   std::vector<double *> d_eri_cache; // pointers for device caches
   std::vector<double *> d_eri_host; // values on host for checking if update
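The two new vectors declared above grow in lockstep with the existing cache arrays, one entry (two entries, for eri_extra) per cached block. The Python model below is a reading aid only: the cache_block() helper and its (addr_dfobj, count) key are assumptions for illustration, since the patch shows the declarations here and the push_back site in device_cuda.cpp below, but not the full update logic.

    class EriCacheModel:
        # Rough Python analogue of the Device members; names mirror the C++ fields.
        def __init__(self):
            self.eri_list = []        # one key per cached eri block
            self.eri_update = []      # # times particular cache updated
            self.eri_size = []        # size of particular cache
            self.eri_num_blocks = []  # # of eri blocks per dfobj (loop trip-count)
            self.eri_extra = []       # flattened per-block data: {naux, nao_pair}
            self.d_eri_cache = []     # device pointers (here: placeholder buffers)

        def cache_block(self, addr_dfobj, count, naux, nao_pair):
            # Hypothetical composite key; the real keying is not shown in the patch.
            self.eri_list.append((addr_dfobj, count))
            self.eri_update.append(0)
            self.eri_size.append(naux * nao_pair)
            if count == 0:                 # first block of a new dfobj
                self.eri_num_blocks.append(0)
            self.eri_num_blocks[-1] += 1
            self.eri_extra += [naux, nao_pair]  # two ints per block
            self.d_eri_cache.append(bytearray(8 * naux * nao_pair))

    cache = EriCacheModel()
    for count, (naux, nao_pair) in enumerate([(128, 5050), (64, 5050)]):
        cache.cache_block(0x1234, count, naux, nao_pair)
    print(cache.eri_num_blocks, cache.eri_extra)  # [2] [128, 5050, 64, 5050]

With this layout, answering the two queries from get_dfobj_status is a list lookup: the count == -1 query reads eri_num_blocks, and the count >= 0 query reads the pair at eri_extra[2*id : 2*id+2].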
diff --git a/gpu/src/device_cuda.cpp b/gpu/src/device_cuda.cpp
index 0c053bab..be44341a 100644
--- a/gpu/src/device_cuda.cpp
+++ b/gpu/src/device_cuda.cpp
@@ -477,7 +477,8 @@ void Device::get_jk(int naux,
 
 #ifdef _USE_ERI_CACHE
 
-  // retrieve or cache eri block
+  // retrieve id of cached eri block
+
   int id = eri_list.size();
   for(int i=0; i<eri_list.size(); ++i)

[... intermediate lines of the device_cuda.cpp diff were lost in extraction ...]

       d_eri_cache.push_back( (double *) pm->dev_malloc(naux * nao_pair * sizeof(double)) );
       int id = d_eri_cache.size() - 1;

[... remaining lines of the device_cuda.cpp diff were lost in extraction ...]

diff --git a/gpu/src/libgpu.cpp b/gpu/src/libgpu.cpp
index 0ca25e2e..1e95e251 100644
--- a/gpu/src/libgpu.cpp
+++ b/gpu/src/libgpu.cpp
@@ -113,6 +113,14 @@ void libgpu_set_update_dfobj_(void * ptr, int update_dfobj)
 
 /* ---------------------------------------------------------------------- */
 
+void libgpu_get_dfobj_status(void * ptr, size_t addr_dfobj, py::array_t<int> arg)
+{
+  Device * dev = (Device *) ptr;
+  dev->get_dfobj_status(addr_dfobj, arg);
+}
+
+/* ---------------------------------------------------------------------- */
+
 void libgpu_hessop_get_veff(void * ptr,
 			    int naux, int nmo, int ncore, int nocc,
 			    py::array_t<double> bPpj, py::array_t<double> vPpj, py::array_t<double> vk_bj)
diff --git a/gpu/src/libgpu.h b/gpu/src/libgpu.h
index 637dbfae..ce3f2a54 100644
--- a/gpu/src/libgpu.h
+++ b/gpu/src/libgpu.h
@@ -30,7 +30,9 @@ extern "C"
 		     int, int, size_t);
   void libgpu_pull_get_jk(void *, py::array_t<double>, py::array_t<double>, int);
+
   void libgpu_set_update_dfobj_(void *, int);
+  void libgpu_get_dfobj_status(void *, size_t, py::array_t<int>);
 
   void libgpu_hessop_get_veff(void *,
 			      int, int, int, int,
@@ -61,7 +63,9 @@ PYBIND11_MODULE(libgpu, m) {
   m.def("libgpu_compute_get_jk", &libgpu_compute_get_jk, "pyscf/df/df_jk.py::get_jk()");
   m.def("libgpu_init_get_jk", &libgpu_init_get_jk, "alloc for get_jk()");
   m.def("libgpu_pull_get_jk", &libgpu_pull_get_jk, "retrieve vj & vk from get_jk()");
+
   m.def("libgpu_set_update_dfobj_", &libgpu_set_update_dfobj_, "ensure that eri is updated on device for get_jk");
+  m.def("libgpu_get_dfobj_status", &libgpu_get_dfobj_status, "retrieve info on dfobj and cached eri blocks on device");
 
   m.def("libgpu_hessop_get_veff", &libgpu_hessop_get_veff, "lasci_sync.py::get_veff() for HessianOperator");
   m.def("libgpu_hessop_push_bPpj", &libgpu_hessop_push_bPpj, "bPpj array for HessianOperator");
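A final note on staleness: per the comments in Device::set_update_dfobj_() and Device::get_dfobj_status() above, a pending update_dfobj means the device reports nothing useful, so get_jk() falls back to the copying loop, and the flag is cleared again in Device::pull_get_jk. The sketch below models just that gating; MockDevice is an invented stand-in, not the real extension.

    import numpy

    class MockDevice:
        # Invented stand-in; only the arg[] convention and the early-return
        # behavior of Device::get_dfobj_status() are modeled here.
        def __init__(self, num_blocks):
            self.update_dfobj = 0
            self.num_blocks = num_blocks

        def set_update_dfobj_(self, val):
            self.update_dfobj = val  # real code resets this in pull_get_jk

        def get_dfobj_status(self, addr_dfobj, arg):
            arg[3] = self.update_dfobj
            if self.update_dfobj:
                return               # nothing useful while an update is pending
            if arg[2] == -1:
                arg[2] = self.num_blocks

    def can_bypass(dev, addr_dfobj):
        arg = numpy.array([-1, -1, -1, -1], dtype=numpy.int32)
        dev.get_dfobj_status(addr_dfobj, arg)
        return arg[2] > -1

    dev = MockDevice(num_blocks=4)
    print(can_bypass(dev, 0x1234))   # True: cached blocks, copy loop skipped
    dev.set_update_dfobj_(1)         # dfobj changed: device must re-copy ERIs
    print(can_bypass(dev, 0x1234))   # False: fall back to dfobj.loop(blksize)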