Merge branch 'valay_gpu' into valay_gpu

cjknight · Jul 29, 2024 · 62451ef · 62451ef
2 parents 0807b09 + 8fb6328
commit 62451ef
Show file tree

Hide file tree

Showing 8 changed files with 424 additions and 149 deletions.
diff --git a/gpu/src/device.cpp b/gpu/src/device.cpp
@@ -4,7 +4,7 @@
 
 #include "device.h"
 
-#define _NUM_SIMPLE_TIMER 6
+#define _NUM_SIMPLE_TIMER 8
 
 #define _DEBUG_OPENMP
 
@@ -150,7 +150,13 @@ Device::~Device()
   printf("LIBGPU :: SIMPLE_TIMER :: i= %i  name= df_ao2mo_pass1_fdrv() time= %f s\n",5,t_array[5]);
 
   printf("\nLIBGPU :: SIMPLE_TIMER :: _update_h2eff\n");
-  printf("LIBGPU :: SIMPLE_TIMER :: i= %i  name= update_h2eff_sub() time= %f s\n",5,t_array[6]);
+  printf("LIBGPU :: SIMPLE_TIMER :: i= %i  name= update_h2eff_sub() time= %f s\n",6,t_array[6]);
+
+  printf("\nLIBGPU :: SIMPLE_TIMER :: _contract1 \n");
+  printf("LIBGPU :: SIMPLE_TIMER :: i= %i  name= h2eff_contract1() time= %f s\n",7,t_array[7]);
+
+  printf("\nLIBGPU :: SIMPLE_TIMER :: transfer_mo_coeff \n");
+  printf("LIBGPU :: SIMPLE_TIMER :: i= %i  name= transfer_mo_coeff() time= %f s\n",8,t_array[8]);
   printf("LIBGPU :: SIMPLE_TIMER :: total= %f s\n",total);
 
   free(t_array);

diff --git a/gpu/src/device.h b/gpu/src/device.h
@@ -90,7 +90,10 @@ public :
   void h2eff_df_contract1(py::array_t<double>, 
                      int, int, int, int, int,
                      py::array_t<double>, py::array_t<double>);
-
+ void get_h2eff_df( py::array_t<double> , 
+                    int , int , int , int , int ,
+                    py::array_t<double> );
+
   void transfer_mo_coeff(py::array_t<double>, int);
 private:
 

diff --git a/gpu/src/device_cuda.cpp b/gpu/src/device_cuda.cpp
diff --git a/gpu/src/libgpu.cpp b/gpu/src/libgpu.cpp
@@ -184,3 +184,12 @@ void libgpu_h2eff_df_contract1(void * ptr,
 }
 
 /* ---------------------------------------------------------------------- */
+void libgpu_get_h2eff_df(void * ptr, 
+                           py::array_t<double> cderi, 
+                           int nao, int nmo, int ncas, int naux, int ncore,
+                           py::array_t<double> eri1)
+{
+  Device * dev = (Device *) ptr;
+  dev->get_h2eff_df(cderi, nao, nmo, ncas, naux, ncore, eri1); 
+}
+/* ---------------------------------------------------------------------- */
diff --git a/gpu/src/libgpu.h b/gpu/src/libgpu.h
@@ -57,6 +57,11 @@ extern "C"
                            py::array_t<double> ,  
                            int , int , int , int , int ,
                            py::array_t<double> ,py::array_t<double> );
+  void libgpu_get_h2eff_df(void * , 
+                           py::array_t<double> , 
+                           int , int , int , int , int ,
+                           py::array_t<double> );
+
 }
 
 
@@ -83,6 +88,8 @@ PYBIND11_MODULE(libgpu, m) {
   m.def("libgpu_df_ao2mo_pass1_fdrv", &libgpu_df_ao2mo_pass1_fdrv, "pyscf/mcscf/df.py::_ERIS.__init__() part 1");
   m.def("libgpu_update_h2eff_sub", &libgpu_update_h2eff_sub, "my_pyscf/mcscf/lasci_sync.py::_update_h2_eff()");
   m.def("libgpu_h2eff_df_contract1", &libgpu_h2eff_df_contract1, "my_pyscf/df/sparse_df.py::contract1");
+
+  m.def("libgpu_get_h2eff_df", &libgpu_get_h2eff_df, "my_pyscf/mcscf/las_ao2mo.py::get_h2eff_df");
 
   m.def("libgpu_orbital_response", &libgpu_orbital_response, "mrh/lasscf_sync_o0.py::orbital_response");
 }

diff --git a/my_pyscf/mcscf/las_ao2mo.py b/my_pyscf/mcscf/las_ao2mo.py
@@ -3,16 +3,17 @@
 from pyscf import ao2mo, lib
 from mrh.my_pyscf.df.sparse_df import sparsedf_array
 from mrh.my_pyscf.gpu import libgpu
-DEBUG=False
+DEBUG=True
 def get_h2eff_df (las, mo_coeff):
     # Store intermediate with one contracted ao index for faster calculation of exchange!
     log = lib.logger.new_logger (las, las.verbose)
     gpu=las.use_gpu
     nao, nmo = mo_coeff.shape
-    print('full mo_coeff',mo_coeff)
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     mo_cas = mo_coeff[:,ncore:nocc]
+    if gpu: 
+        libgpu.libgpu_transfer_mo_coeff(gpu,mo_cas.copy(),mo_cas.size)#TODO: make it async 
     naux = las.with_df.get_naoaux ()
     log.debug2 ("LAS DF ERIs: %d MB used of %d MB total available", lib.current_memory ()[0], las.max_memory)
     mem_eris = 8*(nao+nmo)*ncas*ncas*ncas / 1e6
@@ -23,10 +24,8 @@ def get_h2eff_df (las, mo_coeff):
     if mem_enough_int:
         mem_av -= mem_int
         bmuP = []
-        print("LAS DF ERI including intermediate cache")
         log.debug ("LAS DF ERI including intermediate cache")
     else:
-        print("LAS DF ERI not including intermediate cache")
         log.debug ("LAS DF ERI not including intermediate cache")
     safety_factor = 1.1
     mem_per_aux = nao*ncas # bmuP
@@ -73,19 +72,107 @@ def get_h2eff_df (las, mo_coeff):
                       str (bPmn.shape), str (np.shares_memory (bPmn, cderi)),
                       str (np.may_share_memory (bPmn, cderi)),
                       str (bPmn.flags['C_CONTIGUOUS']))
-        if mem_enough_int: bmuP.append (bmuP1)
+        if mem_enough_int and not gpu: bmuP.append (bmuP1)
         buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
         eri1 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
         eri1 = np.tensordot (mo_coeff.conjugate (), eri1, axes=((0),(0)))
         eri += lib.pack_tril (eri1.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
         cderi = bPmn = bmuP1 = buvP = eri1 = None
         t1 = lib.logger.timer (las, 'rest of the calculation', *t1)
-    if mem_enough_int: eri = lib.tag_array (eri, bmPu=np.concatenate (bmuP, axis=-1).transpose (0,2,1))
+    if mem_enough_int and not gpu: eri = lib.tag_array (eri, bmPu=np.concatenate (bmuP, axis=-1).transpose (0,2,1))
     if las.verbose > lib.logger.DEBUG:
         eri_comp = las.with_df.ao2mo (mo, compact=True)
         lib.logger.debug(las,"CDERI two-step error: {}".format(linalg.norm(eri-eri_comp)))
     return eri
 
+def get_h2eff_gpu (las,mo_coeff):
+    log = lib.logger.new_logger (las, las.verbose)
+    gpu=las.use_gpu
+    nao, nmo = mo_coeff.shape
+    #mo_coeff=np.random.randint(0,2,size=mo_coeff.shape)#.reshape(mo_coeff.shape) ######################################################################
+    #mo_coeff[0,0]=100
+    #mo_coeff[0,3]=-100
+    ncore, ncas = las.ncore, las.ncas
+    nocc = ncore + ncas
+    mo_cas = mo_coeff[:,ncore:nocc]
+    if gpu: libgpu.libgpu_transfer_mo_coeff(gpu,mo_coeff.copy(),mo_coeff.size)#TODO: make it async 
+    naux = las.with_df.get_naoaux ()
+    if gpu: blksize = 50
+    else:
+        mem_eris = 8*(nao+nmo)*ncas*ncas*ncas / 1e6
+        mem_eris += 8*lib.num_threads ()*nao*nmo / 1e6 
+        mem_av = las.max_memory - lib.current_memory ()[0] - mem_eris
+        mem_int = 16*naux*ncas*nao / 1e6
+        mem_enough_int = mem_av > mem_int
+        if mem_enough_int:
+            mem_av -= mem_int
+            bmuP = []
+            log.debug ("LAS DF ERI including intermediate cache")
+        else:
+            log.debug ("LAS DF ERI not including intermediate cache")
+        safety_factor = 1.1
+        mem_per_aux = nao*ncas # bmuP
+        mem_per_aux += ncas*ncas # buvP
+        mem_per_aux += nao*lib.num_threads () # wrk in contract1
+        if not isinstance (getattr (las.with_df, '_cderi', None), np.ndarray):
+            mem_per_aux += 3*nao*(nao+1)//2 # cderi / bPmn
+            # NOTE: I think a linalg.norm operation in sparsedf_array might be doubling the memory
+            # footprint of bPmn below
+        else:
+            mem_per_aux += nao*(nao+1) # see note above
+        mem_per_aux *= safety_factor * 8 / 1e6
+        mem_per_aux = max (1, mem_per_aux)
+        blksize = max (1, min (naux, int (mem_av / mem_per_aux)))
+        log.debug2 ("LAS DF ERI blksize = %d, mem_av = %d MB, mem_per_aux = %d MB", blksize, mem_av, mem_per_aux)
+        log.debug2 ("LAS DF ERI naux = %d, nao = %d, nmo = %d", naux, nao, nmo)
+    assert (blksize>1)
+    eri = 0
+    t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
+    for cderi in las.with_df.loop (blksize=blksize):
+        t1 = lib.logger.timer (las, 'Sparsedf', *t0)
+        naux = cderi.shape[0]
+        eri1 = np.empty((nmo, int(ncas*ncas*(ncas+1)/2)),dtype='d')
+        if DEBUG and gpu:
+            libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
+            print('gpu',eri1)
+            bPmu = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(cderi),mo_cas)
+
+            bPvu = np.einsum('mv,Pmu->Pvu',mo_cas.conjugate(),bPmu)
+
+            bumP = bPmu.transpose(2,1,0)
+            buvP = bPvu.transpose(2,1,0)
+            eri2 = np.einsum('uvP,wmP->uvwm', buvP, bumP)
+            eri2 = np.einsum('mM,uvwm->Mwvu', mo_coeff.conjugate(),eri2)
+            print('final matmul',eri2)
+            eri2 = lib.pack_tril (eri2.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
+            print('cpu',eri2)
+            #bPmn = sparsedf_array (cderi)
+            #bmuP1 = bPmn.contract1 (mo_cas)
+            #bmuP1 = np.einsum('Pmn,nu->Pmu',lib.unpack_tril(bPmn),mo_cas)
+            #bmuP1 = bmuP1.transpose(1,2,0)
+            #buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
+            #eri2 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
+            #eri2 = np.tensordot (mo_coeff.conjugate (), eri2, axes=((0),(0)))
+            if np.allclose(eri1,eri2): print("h2eff is working")
+            else: print("h2eff not working"); exit()
+        elif gpu: libgpu.libgpu_get_h2eff_df(gpu, cderi, nao, nmo, ncas, naux, ncore,eri1)
+        else: 
+            bPmn = sparsedf_array (cderi)
+            bmuP1 = bPmn.contract1 (mo_cas)
+            #bmuP1 = np.einsum('Pmn,nu->muP',unpack_tril(bPmn),mo_cas)
+            buvP = np.tensordot (mo_cas.conjugate (), bmuP1, axes=((0),(0)))
+            eri1 = np.tensordot (bmuP1, buvP, axes=((2),(2)))
+            eri1 = np.tensordot (mo_coeff.conjugate (), eri1, axes=((0),(0)))
+            eri1 = lib.pack_tril (eri1.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
+            cderi = bPmn = bmuP1 = buvP = None
+        t1 = lib.logger.timer (las, 'contract1 gpu', *t1)
+        eri +=eri1
+        eri1= None
+    return eri
+
+
+
+
 def get_h2eff (las, mo_coeff=None):
     if mo_coeff is None: mo_coeff = las.mo_coeff
     nao, nmo = mo_coeff.shape
@@ -100,7 +187,8 @@ def get_h2eff (las, mo_coeff=None):
         raise MemoryError ("{} MB of {}/{} MB av/total for ERI array".format (
             mem_eris, mem_remaining, las.max_memory))
     if getattr (las, 'with_df', None) is not None:
-        eri = get_h2eff_df (las, mo_coeff)
+        if las.use_gpu: eri = get_h2eff_df (las,mo_coeff)#the full version is not working yet.
+        else: eri = get_h2eff_df (las, mo_coeff)
     elif getattr (las._scf, '_eri', None) is not None:
         eri = ao2mo.incore.general (las._scf._eri, mo, compact=True)
     else:

diff --git a/my_pyscf/mcscf/lasci.py b/my_pyscf/mcscf/lasci.py
@@ -1956,16 +1956,17 @@ def fast_veffa (self, casdm1s_sub, h2eff_sub, mo_coeff=None, ci=None, _full=Fals
         ncas = sum (ncas_sub)
         nocc = ncore + ncas
         nao, nmo = mo_coeff.shape
-
+        gpu=self.use_gpu
         mo_cas = mo_coeff[:,ncore:nocc]
         moH_cas = mo_cas.conjugate ().T
         moH_coeff = mo_coeff.conjugate ().T
         dma = linalg.block_diag (*[dm[0] for dm in casdm1s_sub])
         dmb = linalg.block_diag (*[dm[1] for dm in casdm1s_sub])
         casdm1s = np.stack ([dma, dmb], axis=0)
-        if not (isinstance (self, _DFLASCI)):
+        if gpu or not (isinstance (self, _DFLASCI)):
             dm1s = np.dot (mo_cas, np.dot (casdm1s, moH_cas)).transpose (1,0,2)
-            return self.get_veff (dm1s = dm1s, spin_sep=True)
+            if not _full: dm1s = dm1s[0]+dm1s[1]
+            return self.get_veff (dm1s = dm1s, spin_sep=_full)
         casdm1 = casdm1s.sum (0)
         dm1 = np.dot (mo_cas, np.dot (casdm1, moH_cas))
         bPmn = sparsedf_array (self.with_df._cderi)

diff --git a/my_pyscf/mcscf/lasci_sync.py b/my_pyscf/mcscf/lasci_sync.py
@@ -8,7 +8,7 @@
 from mrh.my_pyscf.gpu import libgpu
 
 # Setting DEBUG = True will execute both CPU (original) and GPU (new) paths checking for consistency 
-DEBUG = False
+DEBUG = True
 
 if DEBUG:
     import math
@@ -1329,9 +1329,9 @@ def update_mo_ci_eri (self, x, h2eff_sub):
           if(np.allclose(h2eff_sub,h2eff_sub2,atol=1e-13)): print('H2eff test passed')
           else:print('H2eff gpu kernel is not working');print(np.max((h2eff_sub-h2eff_sub2)*(h2eff_sub-h2eff_sub2)));exit()
         elif gpu:
-          h2eff_sub = self._update_h2eff_sub_gpu (gpu, mo1, umat, h2eff_sub,log, t0) 
+          h2eff_sub = self._update_h2eff_sub_gpu (gpu, mo1, umat, h2eff_sub)
         else:
-          h2eff_sub = self._update_h2eff_sub (mo1, umat, h2eff_sub,log,t0) 
+          h2eff_sub = self._update_h2eff_sub (mo1, umat, h2eff_sub)
         t0=log.timer('update_h2eff_sub',*t0)
         return mo1, ci1, h2eff_sub
 
@@ -1357,20 +1357,17 @@ def _update_ci (self, dci):
             ci1.append (ci1_r)
         return ci1
 
-    def _update_h2eff_sub_gpu(self,gpu,mo1,umat,h2eff_sub,log,t0):
+    def _update_h2eff_sub_gpu(self,gpu,mo1,umat,h2eff_sub):
         ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
         ucas = umat[ncore:nocc, ncore:nocc]
         bmPu = None
         if hasattr (h2eff_sub, 'bmPu'): bmPu = h2eff_sub.bmPu
-        t0 = log.timer('setup',*t0)
         libgpu.libgpu_update_h2eff_sub(gpu,self.ncore,self.ncas,self.nocc,self.nmo,umat, h2eff_sub)
-        t0 = log.timer('libgpu call to h2eff_update',*t0)
         if bmPu is not None:
             bmPu = np.dot (bmPu, ucas)
             h2eff_sub = lib.tag_array (h2eff_sub, bmPu = bmPu)
-        t0 = log.timer('bmPu work',*t0)
-        return h2eff_sub
-
+        return h2eff_sub 
+
     def _update_h2eff_sub_debug(self, mo1, umat, h2eff_sub):
         ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
         ucas = umat[ncore:nocc, ncore:nocc]
@@ -1388,12 +1385,11 @@ def _update_h2eff_sub_debug(self, mo1, umat, h2eff_sub):
         h2eff_sub = h2eff_sub[:,:,(ix_i*ncas)+ix_j]
         h2eff_sub = h2eff_sub.reshape (nmo, -1)
         return h2eff_sub
-    def _update_h2eff_sub (self, mo1, umat, h2eff_sub, log, t0):
+    def _update_h2eff_sub (self, mo1, umat, h2eff_sub):
         ncore, ncas, nocc, nmo = self.ncore, self.ncas, self.nocc, self.nmo
         ucas = umat[ncore:nocc, ncore:nocc]
         bmPu = None
-        if hasattr (h2eff_sub, 'bmPu'): print('here');bmPu = h2eff_sub.bmPu
-        t0 = log.timer('setup',*t0)
+        if hasattr (h2eff_sub, 'bmPu'):bmPu = h2eff_sub.bmPu
         h2eff_sub = h2eff_sub.reshape (nmo*ncas, ncas*(ncas+1)//2)
         h2eff_sub = lib.numpy_helper.unpack_tril (h2eff_sub)
         h2eff_sub = h2eff_sub.reshape (nmo, ncas, ncas, ncas)
@@ -1405,11 +1401,9 @@ def _update_h2eff_sub (self, mo1, umat, h2eff_sub, log, t0):
         h2eff_sub = h2eff_sub.reshape (nmo, ncas, ncas*ncas)
         h2eff_sub = h2eff_sub[:,:,(ix_i*ncas)+ix_j]
         h2eff_sub = h2eff_sub.reshape (nmo, -1)
-        t0 = log.timer('big math',*t0)
         if bmPu is not None:
             bmPu = np.dot (bmPu, ucas)
             h2eff_sub = lib.tag_array (h2eff_sub, bmPu = bmPu)
-        t0 = log.timer('bmPu work',*t0)
         return h2eff_sub
 
     def get_grad (self):