MP2 cleaned up #186

Open · wants to merge 45 commits into base: master
Commits (45)
4e6d2f7
add CMAKE related files
Aug 11, 2020
4634c2f
Merge remote-tracking branch 'upstream/master'
Sep 23, 2020
f0589cc
added modified source files
Oct 3, 2020
dfacd16
add new prints in gpu.cu and gpu_MP2.cu
Oct 5, 2020
e9f76b2
call upload_para_to_const() in cuda/gpu.cu. Y matrix in cuda is fine …
Oct 7, 2020
365ee5f
cuda MP2 works for water_ene_sto3g.in
Oct 13, 2020
14b5006
add gpu_mp2_wrapper
Oct 13, 2020
33a6c52
finished first three transformations of ERI. Works for sto-3g water
Nov 3, 2020
c78b42f
CUDA MP2 implemented with orbmp2i and orbmp2j allocated on each thread
Nov 7, 2020
8f1518b
MP2 Y_Matrix all transformation on CUDA
Nov 13, 2020
2e03a83
migrate entire MP2 to cuda
Nov 18, 2020
6653282
four quarter trans of integral direct method done. before removing Y_…
Dec 3, 2020
ee048b3
removed Y_Matrix
Dec 3, 2020
f55e5cd
cuda mp2 should be done in the output file
Dec 4, 2020
19ad99c
add CMAKE related files
Aug 11, 2020
8f714cc
added modified source files
Oct 3, 2020
9e0122a
add new prints in gpu.cu and gpu_MP2.cu
Oct 5, 2020
bfc85d9
call upload_para_to_const() in cuda/gpu.cu. Y matrix in cuda is fine …
Oct 7, 2020
f8ed32c
cuda MP2 works for water_ene_sto3g.in
Oct 13, 2020
beda42a
add gpu_mp2_wrapper
Oct 13, 2020
7b61233
finished first three transformations of ERI. Works for sto-3g water
Nov 3, 2020
af9f4ff
CUDA MP2 implemented with orbmp2i and orbmp2j allocated on each thread
Nov 7, 2020
8f40c6e
MP2 Y_Matrix all transformation on CUDA
Nov 13, 2020
c5908cf
migrate entire MP2 to cuda
Nov 18, 2020
bc69d2d
four quarter trans of integral direct method done. before removing Y_…
Dec 3, 2020
0201be8
removed Y_Matrix
Dec 3, 2020
57c08fb
cuda mp2 should be done in the output file
Dec 4, 2020
7e3887f
revert bCUDA in src/modules/quick_method_module.f90
Dec 4, 2020
cba6099
commit all fixed conflicts
Dec 4, 2020
a0cf745
fixed conflicts again
Dec 4, 2020
b57946b
fix conflicts again
Dec 4, 2020
88f8b92
fix effect integral for serial
Dec 8, 2020
f04e0f6
CUDA MP2 cleaned up
Dec 10, 2020
796d1ae
to replace the remote cleaned_up branch which merged 2-3 and 4-accu
Feb 17, 2021
2100af6
fix comments for gpu mp2
Mar 21, 2021
608c6dd
removed old CMake files
Mar 24, 2021
9d56b56
comment on src/scf_operator.f90
Mar 24, 2021
56fb3ee
resolve merge conflicts
Mar 24, 2021
05b505d
fix BLYP in gpu_MP2.cu
Mar 24, 2021
e492a4e
added frozen core to serial MP2
Apr 14, 2021
56390d8
Fully implemented frozencore for both serial and CUDA MP2
Apr 16, 2021
d34349c
frozencore memory reduction
May 18, 2021
28298a5
test if token works
May 18, 2021
604eef9
deleted test_token.txt
May 18, 2021
aa0df12
edit for cuda/10.1.243
Jun 1, 2021
5 changes: 5 additions & 0 deletions src/basis.f90
@@ -42,6 +42,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
! * Allocate arrays whose dimensions depend on NATOM (allocateatoms_ecp)
! * Read the Effective Core Potentials (ECPs), modify the atomic charges
! and the total number of electrons (readecp)
print *, "call readbasis"
if (quick_method%ecp) call readecp
call quick_open(ibasisfile,basisfilename,'O','F','W',.true.,ierr)
CHECK_ERROR(ierr)
@@ -56,6 +57,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
atmbs2=.true.
icont=0
quick_method%ffunxiao=.true.
nfrozencore = 0

! parse the file and find the sizes of things to allocate them in memory
do while (iofile == 0 )
@@ -227,6 +229,9 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
nshell = nshell + quick_basis%kshell(quick_molspec%iattype(i))
nbasis = nbasis + kbasis(quick_molspec%iattype(i))
nprim = nprim + kcontract(quick_molspec%iattype(i))
if(quick_method%frzCore)then
nfrozencore = nfrozencore + frozencore(quick_molspec%iattype(i))/2
endif

! MFCC
if(i.eq.natomfinal)nbasisfinal=nbasis
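
The hunk above converts per-element frozen core electron counts into doubly occupied core orbitals by summing frozencore(iattype)/2 over atoms. A minimal self-contained sketch of that counting, with frozencore_z as a hypothetical stand-in for the per-element table (it freezes the 1s pair for Z > 2 and nothing for H or He):

! Sketch only: counting frozen core orbitals per atom.
! frozencore_z is a hypothetical stand-in for QUICK's frozencore table.
program count_frozen_core
  implicit none
  integer, parameter :: natom = 3
  integer :: iattype(natom) = (/ 8, 1, 1 /)   ! water: O, H, H
  integer :: i, nfrozencore

  nfrozencore = 0
  do i = 1, natom
     ! divide by 2: core electrons -> doubly occupied spatial orbitals
     nfrozencore = nfrozencore + frozencore_z(iattype(i))/2
  end do
  print *, "frozen core orbitals:", nfrozencore   ! prints 1 for water

contains

  integer function frozencore_z(z)
    integer, intent(in) :: z
    if (z > 2) then
       frozencore_z = 2   ! freeze the 1s pair for Li and heavier
    else
       frozencore_z = 0   ! nothing to freeze for H, He
    end if
  end function frozencore_z

end program count_frozen_core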
135 changes: 110 additions & 25 deletions src/calMP2.f90
@@ -9,11 +9,41 @@ subroutine calmp2
use quick_cutoff_module, only: cshell_density_cutoff
implicit double precision(a-h,o-z)

double precision cutoffTest,testtmp,testCutoff
integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
double precision cutoffTest,testtmp,testCutoff,gpuMP2WrapperTimeStart, gpuMP2WrapperTimeEnd
double precision :: gpuMp2cor(1),gpuEmemorysum(1)
integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2,NONZEROCOUNT,gpuNstepmp2(1)
integer:: gpuNtemp(1)
common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
integer :: nelec,nelecb

if(.not. quick_method%frzCore)then
print *, "Do not use fc approximation!"
endif
print *, "nfrozencore is ", nfrozencore


#ifdef CUDA
if(quick_method%bCUDA) then

call PrtAct(ioutfile,"Begin MP2 Calculation")

call cpu_time(gpuMP2WrapperTimeStart)
call gpu_mp2_wrapper(quick_qm_struct%co,quick_qm_struct%vec,quick_qm_struct%dense,quick_qm_struct%E,&
cutmatrix,quick_method%integralCutoff,quick_method%primLimit,quick_method%DMCutoff,gpuMp2cor,&
gpuEmemorysum, gpuNstepmp2, gpuNtemp)
call cpu_time(gpuMP2WrapperTimeEnd)

write(ioutfile,'("CURRENT MEMORY USAGE=",E12.6,"M")') gpuEmemorysum(1)
write(ioutfile,'("TOTAL STEP =",I6)') gpuNstepmp2(1)
write(ioutfile,'("EFFECT INTEGRALS =",i8)') gpuNtemp(1)

quick_qm_struct%EMP2 = gpuMp2cor(1)
print *, 'cuda calculated mp2cor is', quick_qm_struct%EMP2
timer_cumer%TMP2 = gpuMP2WrapperTimeEnd-gpuMP2WrapperTimeStart
print '("Total GPU MP2 Wrapper Time = ",f6.3," seconds.")', timer_cumer%TMP2

endif
#else
nelec = quick_molspec%nelec
nelecb = quick_molspec%nelecb
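
The wrapper call above returns its scalar results (gpuMp2cor, gpuEmemorysum, gpuNstepmp2, gpuNtemp) through length-1 arrays, so the CUDA side can write through a pointer. A minimal sketch of that convention, with fake_gpu_mp2 as a hypothetical stand-in for the real wrapper:

! Sketch only: returning a scalar through a length-1 array argument,
! as the gpu_mp2_wrapper call above does.
program demo_scalar_via_array
  implicit none
  double precision :: emp2out(1)

  call fake_gpu_mp2(emp2out)
  print *, "EMP2 correction:", emp2out(1)

contains

  ! Hypothetical stand-in for gpu_mp2_wrapper: writes its scalar
  ! result into element 1 of a length-1 array argument.
  subroutine fake_gpu_mp2(emp2)
    double precision, intent(out) :: emp2(1)
    emp2(1) = -0.032d0   ! made-up number; the real value comes from the GPU
  end subroutine fake_gpu_mp2

end program demo_scalar_via_array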

@@ -23,34 +23,53 @@ subroutine calmp2
quick_qm_struct%EMP2=0.0d0

! occupied and virtual orbitals number
iocc=Nelec/2
!change here
!iocc=Nelec/2
iocc=Nelec/2-nfrozencore
ivir=Nbasis-Nelec/2

! calculate memory usage and determine steps
ememorysum=real(iocc*ivir*nbasis*8.0d0/1024.0d0/1024.0d0/1024.0d0)

! actually nstep is step length
nstep=min(int(1.5d0/ememorysum),Nelec/2)

! set max mem req for Q3 to larger values
reqmemmax = 1.5d0
!change here
!nstep=min(int(reqmemmax/ememorysum),Nelec/2-nfrozencore)
nstep=min(int(reqmemmax/ememorysum),iocc)
if(nstep<1)then
nstep=1
endif

! if with f orbital
if(quick_method%ffunxiao)then
nbasistemp=6
else
nbasistemp=10
endif

print *, "in calmp2, nstep is", nstep
print *, "in calmp2, reqmemmax is", reqmemmax,&
"nbasistemp is", nbasistemp

! Allocate some variables
allocate(mp2shell(nbasis))
allocate(orbmp2(ivir,ivir))
allocate(orbmp2i331(nstep,nbasis,nbasistemp,nbasistemp,2))
allocate(orbmp2j331(nstep,ivir,nbasistemp,nbasistemp,2))
!change here
!allocate(orbmp2k331(nstep,iocc,ivir,nbasis))
allocate(orbmp2k331(nstep,iocc,ivir,nbasis))

! with nstep (actually, it represents step length), we can
! have no. of steps for mp2 calculation
nstepmp2=nelec/2/nstep
!change here
!nstepmp2=(nelec/2-nfrozencore)/nstep
nstepmp2=iocc/nstep
nstepmp2=nstepmp2+1
if(nstep*(nstepmp2-1).eq.nelec/2)then
!change here
!if(nstep*(nstepmp2-1).eq.(nelec/2-nfrozencore))then
if(nstep*(nstepmp2-1).eq.iocc)then
nstepmp2=nstepmp2-1
endif
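
To make the sizing logic above concrete, a self-contained sketch with assumed water/STO-3G-like dimensions (nbasis = 7, nelec = 10, one frozen core orbital); the numbers are illustrative, not taken from a run:

! Sketch only: the step-length and pass-count arithmetic from the hunk above.
program mp2_step_sizing
  implicit none
  integer :: nbasis, nelec, nfrozencore, iocc, ivir, nstep, nstepmp2
  double precision :: ememorysum, reqmemmax

  nbasis = 7; nelec = 10; nfrozencore = 1   ! assumed dimensions
  iocc = nelec/2 - nfrozencore              ! correlated occupied orbitals: 4
  ivir = nbasis - nelec/2                   ! virtual orbitals: 2

  ! memory (GB) per occupied index kept in core: iocc*ivir*nbasis doubles
  ememorysum = real(iocc*ivir*nbasis)*8.0d0/1024.0d0/1024.0d0/1024.0d0
  reqmemmax  = 1.5d0                        ! memory budget in GB, as above

  nstep = min(int(reqmemmax/ememorysum), iocc)   ! step length
  if (nstep < 1) nstep = 1

  nstepmp2 = iocc/nstep + 1                 ! number of passes
  if (nstep*(nstepmp2-1) == iocc) nstepmp2 = nstepmp2 - 1

  print *, "step length:", nstep, "passes:", nstepmp2   ! 4 and 1 here
end program mp2_step_sizing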

Expand All @@ -66,17 +115,28 @@ subroutine calmp2

ttt=MAXVAL(Ycutoff) ! Max Value of Ycutoff

print *, "job step is ", nstepmp2
print *, "nstep is ", nstep
call flush(6)

do i3new=1,nstepmp2 ! Step counter
print *, "i3new is ", i3new
call flush(6)

call cpu_time(timer_begin%TMP2)
ntemp=0 ! integer counter
nstepmp2s=(i3new-1)*nstep+1 ! Step start n
nstepmp2f=i3new*nstep ! Step end n
nstepmp2s=(i3new-1)*nstep+1+nfrozencore ! Step start n
nstepmp2f=i3new*nstep+nfrozencore ! Step end n

if(i3new.eq.nstepmp2)nstepmp2f=nelec/2
nsteplength=nstepmp2f-nstepmp2s+1 ! Step Length, from nstepmp2s to nstepmp2f

print *, "nstepmp2s is ", nstepmp2s
print *, "nstepmp2f is ", nstepmp2f

! Initial orbmp2k331
! change_here
!call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength)
call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength)
do II=1,jshell
do JJ=II,jshell
@@ -116,21 +176,20 @@
enddo
enddo
enddo

testCutoff=testCutoff*comax
if(testCutoff.gt.cutoffmp2)then
dnmax=comax
ntemp=ntemp+1
call shellmp2(nstepmp2s,nsteplength)
endif

call shellmp2(nstepmp2s,nsteplength)
endif
endif

enddo
enddo



NII1=quick_basis%Qstart(II)
NII2=quick_basis%Qfinal(II)
NJJ1=quick_basis%Qstart(JJ)
@@ -168,9 +227,10 @@
enddo

do j33=1,ivir
do k33=1,nelec/2
atemp=quick_scratch%hold(k33,III)
atemp2=quick_scratch%hold(k33,JJJ)
!do k33=1+nfrozencore,nelec/2
do k33=1,iocc
atemp=quick_scratch%hold(k33+nfrozencore,III)
atemp2=quick_scratch%hold(k33+nfrozencore,JJJ)
do icycle=1,nsteplength
orbmp2k331(icycle,k33,j33,JJJ)=orbmp2k331(icycle,k33,j33,JJJ)+ &
orbmp2j331(icycle,j33,IIInew,JJJnew,1)*atemp
@@ -203,7 +263,8 @@

do icycle=1,nsteplength
i3=nstepmp2s+icycle-1
do k3=i3,nelec/2
!do k3=i3,nelec/2
do k3=i3-nfrozencore, iocc

do J3=1,nbasis-nelec/2
do L3=1,nbasis-nelec/2
Expand All @@ -217,13 +278,15 @@ subroutine calmp2

do J3=1,nbasis-nelec/2
do L3=1,nbasis-nelec/2
if(k3.gt.i3)then
quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) &
if(k3.gt.i3-nfrozencore)then
quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3) &
+quick_qm_struct%E(k3+nfrozencore) &
-quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) &
*orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3))
endif
if(k3.eq.i3)then
quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) &
if(k3.eq.i3-nfrozencore)then
quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3) &
+quick_qm_struct%E(k3+nfrozencore) &
-quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) &
*orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3))
endif
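
The two branches above accumulate the closed-shell MP2 correlation energy in symmetry-folded form: occupied pairs are restricted to i <= k, with weight 2 for i < k and 1 for i = k, and the frozen-core offset shifts the orbital-energy indices. In standard notation (a reconstruction of the intended formula, not quoted from the source), with i, k running over correlated occupied orbitals and j, l over virtuals:

E_{\mathrm{MP2}} = \sum_{i \le k}^{\mathrm{occ}} w_{ik} \sum_{j,l}^{\mathrm{vir}} \frac{(ij|kl)\left[2\,(ij|kl) - (il|kj)\right]}{\varepsilon_i + \varepsilon_k - \varepsilon_j - \varepsilon_l}, \qquad w_{ik} = \begin{cases} 2, & i < k \\ 1, & i = k \end{cases}

where orbmp2(j3,l3) holds the fully transformed integral (ij|kl) for the current occupied pair.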
@@ -233,10 +296,14 @@
enddo
enddo


print *, "cleaned up!"
call cpu_time(timer_end%TMP2)
timer_cumer%TMP2=timer_end%TMP2-timer_begin%TMP2+timer_cumer%TMP2

enddo
write (ioutfile,'("EFFECT INTEGRALS =",i8)') ntemp
#endif

write (iOutFile,'("SECOND ORDER ENERGY =",F16.9)') quick_qm_struct%EMP2
write (iOutFile,'("EMP2 =",F16.9)') quick_qm_struct%Etot+quick_qm_struct%EMP2
@@ -392,7 +459,8 @@
if(testCutoff.gt.cutoffmp2)then
dnmax=comax
ntemp=ntemp+1
call shellmp2(nstepmp2s,nsteplength)
!call shellmp2(nstepmp2s,nsteplength,Y_Matrix)
call shellmp2(nstepmp2s,nsteplength)
endif

endif
@@ -971,3 +1039,20 @@ subroutine initialOrbmp2ij(orbmp2i331,nstep,nsteplength,nbasis,nbasistemp,nbasis
enddo
enddo
end subroutine initialOrbmp2ij

subroutine print_matrix(matrix,n)
integer n, i, j
double precision, dimension(n,n),intent(in) :: matrix

do i=1, n
do j=1, n
write(*, '(f16.8)', advance='no') matrix(i,j)
!write(*, '(f20.6)'), matrix(i,j)
end do
write(*,'(" ")')
end do

end subroutine print_matrix
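
A minimal driver for the debug helper above, assuming it is compiled and linked together with this file; the matrix values are arbitrary:

! Sketch only: invoking the print_matrix helper defined above.
program demo_print_matrix
  implicit none
  double precision :: a(2,2)

  a = reshape((/ 1.0d0, 2.0d0, 3.0d0, 4.0d0 /), (/ 2, 2 /))
  call print_matrix(a, 2)   ! prints the 2x2 matrix, one row per line
end program demo_print_matrix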



2 changes: 1 addition & 1 deletion src/cuda/Makefile
@@ -24,9 +24,9 @@ include $(MAKEIN)

LIBXC_CUDA_FLAGS = -I$(libxcfolder)

CUDACOBJ=$(objfolder)/gpu.o $(objfolder)/gpu_type.o $(objfolder)/gpu_get2e.o $(objfolder)/gpu_MP2.o
CXXOBJ=$(objfolder)/xc_redistribute.o

CUDAXCOBJ=$(objfolder)/gpu_getxc.o
