diff --git a/gpu/mini-apps/ao2mo/runner.py b/gpu/mini-apps/ao2mo/runner.py index f9e50b51..9527f2b0 100644 --- a/gpu/mini-apps/ao2mo/runner.py +++ b/gpu/mini-apps/ao2mo/runner.py @@ -102,7 +102,7 @@ def init_eri_gpu_v1 (mo, casscf, with_df): libgpu.libgpu_pull_jk_ao2mo (gpu, j_pc, k_cp, nmo, ncore) print(naoaux*nmo*nmo) print(naoaux*nmo*ncas) - libgpu.libgpu_pull_ints_ao2mo(gpu, fxpp, bufpa, naoaux, nmo, ncas) + libgpu.libgpu_pull_ints_ao2mo(gpu, fxpp, bufpa, blksize, naoaux, nmo, ncas) k_pc = k_cp.T.copy() print("finishing v1") return fxpp,bufpa, j_pc, k_pc diff --git a/gpu/src/device.h b/gpu/src/device.h index 22683165..7ca5499d 100644 --- a/gpu/src/device.h +++ b/gpu/src/device.h @@ -94,7 +94,7 @@ public : int, size_t); void pull_jk_ao2mo (py::array_t,py::array_t,int, int); - void pull_ints_ao2mo (py::array_t,py::array_t,int, int, int); + void pull_ints_ao2mo (py::array_t,py::array_t, int, int, int, int); void orbital_response(py::array_t, py::array_t, py::array_t, py::array_t, diff --git a/gpu/src/device_cuda.cpp b/gpu/src/device_cuda.cpp index 5e336e7a..161ed2cf 100644 --- a/gpu/src/device_cuda.cpp +++ b/gpu/src/device_cuda.cpp @@ -286,12 +286,32 @@ void Device::pull_jk_ao2mo(py::array_t _j_pc, py::array_t _k_pc, } } /* ---------------------------------------------------------------------- */ -void Device::pull_ints_ao2mo(py::array_t _fxpp, py::array_t _bufpa, int naoaux, int nmo, int ncas) +void Device::pull_ints_ao2mo(py::array_t _fxpp, py::array_t _bufpa, int blksize, int naoaux, int nmo, int ncas) { py::buffer_info info_fxpp = _fxpp.request(); //3D array (nmo*nmo*naoaux) double * fxpp = static_cast(info_fxpp.ptr); printf("size_fxpp %i\n", size_fxpp); - std::memcpy(fxpp, pin_fxpp, size_fxpp*sizeof(double)); + + int count = 0; + int k = 0; + + // naive version to start; we can make this faster + while(k < naoaux) { + int size_vector = (naoaux-k > blksize) ? blksize : naoaux-k; // transfer whole blksize or last subset? + + printf("k= %i size_vector= %i\n",k,size_vector); + for (int i=0; i(info_bufpa.ptr); printf("size_bufpa %i\n", size_bufpa); diff --git a/gpu/src/libgpu.cpp b/gpu/src/libgpu.cpp index 95fb8c6a..2ea075f7 100644 --- a/gpu/src/libgpu.cpp +++ b/gpu/src/libgpu.cpp @@ -191,10 +191,10 @@ void libgpu_pull_jk_ao2mo(void * ptr, } /* ---------------------------------------------------------------------- */ void libgpu_pull_ints_ao2mo(void * ptr, - py::array_t fxpp, py::array_t bufpa, int naoaux, int nmo, int ncas) + py::array_t fxpp, py::array_t bufpa, int blksize, int naoaux, int nmo, int ncas) { Device * dev = (Device *) ptr; - dev->pull_ints_ao2mo(fxpp, bufpa, naoaux, nmo, ncas); + dev->pull_ints_ao2mo(fxpp, bufpa, blksize, naoaux, nmo, ncas); } /* ---------------------------------------------------------------------- */ void libgpu_orbital_response(void * ptr, diff --git a/gpu/src/libgpu.h b/gpu/src/libgpu.h index 9fda2e2f..5518301a 100644 --- a/gpu/src/libgpu.h +++ b/gpu/src/libgpu.h @@ -58,7 +58,7 @@ extern "C" void libgpu_pull_jk_ao2mo(void *, py::array_t, py::array_t,int, int); void libgpu_pull_ints_ao2mo(void *, - py::array_t, py::array_t,int, int, int); + py::array_t, py::array_t, int, int, int, int); void libgpu_orbital_response(void *, py::array_t,