Added cpowv_v512_8x16_ps kernel.
bgin authored Nov 2, 2024
1 parent 6b3adf6 commit 2e9c536
Showing 1 changed file with 328 additions and 0 deletions.
Mathematics/GMS_cpow_vec_zmm16r4.f90
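The new kernel evaluates the element-wise complex power (xre(i) + j*xim(i))**vn(i) in polar form: with r = sqrt(re**2 + im**2) and theta = atan(im/re) (the principal argument when re > 0), the result is r**vn * (cos(vn*theta) + j*sin(vn*theta)). A minimal scalar sketch of the same math path, assuming the file's sp single-precision kind:

subroutine cpow_scalar(re,im,v,pr,pi)
   ! Reference for one element; mirrors the kernel's polar-form math.
   ! Assumes re > 0 so that atan(im/re) is the principal argument.
   real(kind=sp), intent(in)  :: re, im, v
   real(kind=sp), intent(out) :: pr, pi
   real(kind=sp) :: r, theta, rv, vt
   r     = sqrt(re*re+im*im)    ! modulus |z|
   theta = atan(im/re)          ! argument arg(z)
   rv    = r**v                 ! |z|**v
   vt    = v*theta
   pr    = rv*cos(vt)           ! Re(z**v)
   pi    = rv*sin(vt)           ! Im(z**v)
end subroutine cpow_scalar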
@@ -1224,6 +1224,334 @@ subroutine cpowv_v512_16x16_ps(xre,xim,vn,cpowr,cpowi,n)
return
end if
end subroutine cpowv_v512_16x16_ps


subroutine cpowv_v512_8x16_ps(xre,xim,vn,cpowr,cpowi,n)
#if defined(__ICC) || defined(__INTEL_COMPILER)
!DIR$ ATTRIBUTES CODE_ALIGN : 32 :: cpowv_v512_8x16_ps
!DIR$ OPTIMIZE : 3
!DIR$ ATTRIBUTES OPTIMIZATION_PARAMETER: TARGET_ARCH=skylake_avx512 :: cpowv_v512_8x16_ps
#endif
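! Element-wise complex power: cpow(i) = (xre(i) + j*xim(i))**vn(i),
! computed in polar form.  Dispatches on n: scalar for n==1, XMM
! (4 lanes) for n<=4, YMM (8 lanes) for n<=8, ZMM (16 lanes) for
! n<=16, one 16-wide block per pass for n<=32, and an 8x-unrolled
! ZMM path for n>32.  The argument is taken as atan(xim/xre), the
! principal value only when xre > 0.  The '.v' component access
! relies on an Intel Fortran extension; standard Fortran uses '%'.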
real(kind=sp), allocatable, dimension(:), intent(in) :: xre
real(kind=sp), allocatable, dimension(:), intent(in) :: xim
real(kind=sp), allocatable, dimension(:), intent(in) :: vn
real(kind=sp), allocatable, dimension(:), intent(inout) :: cpowr ! intent(inout): an intent(out) allocatable dummy is deallocated on entry
real(kind=sp), allocatable, dimension(:), intent(inout) :: cpowi
integer(kind=i4), intent(in) :: n

type(ZMM16r4_t), automatic :: zmm0
type(ZMM16r4_t), automatic :: zmm1
type(ZMM16r4_t), automatic :: zmm2
type(ZMM16r4_t), automatic :: zmm3
type(ZMM16r4_t), automatic :: zmm4
type(ZMM16r4_t), automatic :: zmm5
type(ZMM16r4_t), automatic :: zmm6
type(ZMM16r4_t), automatic :: zmm7
type(ZMM16r4_t), automatic :: zmm8
type(ZMM16r4_t), automatic :: zmm9
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm0
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm1
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm2
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm3
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm4
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm5
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm6
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm7
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm8
!DIR$ ATTRIBUTES ALIGN : 64 :: zmm9
type(XMM4r4_t), automatic :: xmm0
type(XMM4r4_t), automatic :: xmm1
type(XMM4r4_t), automatic :: xmm2
type(XMM4r4_t), automatic :: xmm3
type(XMM4r4_t), automatic :: xmm4
type(XMM4r4_t), automatic :: xmm5
type(XMM4r4_t), automatic :: xmm6
type(XMM4r4_t), automatic :: xmm7
type(XMM4r4_t), automatic :: xmm8
type(XMM4r4_t), automatic :: xmm9
type(YMM8r4_t), automatic :: ymm0
type(YMM8r4_t), automatic :: ymm1
type(YMM8r4_t), automatic :: ymm2
type(YMM8r4_t), automatic :: ymm3
type(YMM8r4_t), automatic :: ymm4
type(YMM8r4_t), automatic :: ymm5
type(YMM8r4_t), automatic :: ymm6
type(YMM8r4_t), automatic :: ymm7
type(YMM8r4_t), automatic :: ymm8
type(YMM8r4_t), automatic :: ymm9
real(sp), automatic :: z0
real(sp), automatic :: z1
real(sp), automatic :: z2
real(sp), automatic :: z3
real(sp), automatic :: z4
real(sp), automatic :: z5
real(sp), automatic :: z6
real(sp), automatic :: z7
real(sp), automatic :: z8
real(sp), automatic :: z9
real(sp), automatic :: zx
integer(i4), automatic :: i,ii,j
integer(i4), automatic :: idx1,idx2,idx3,idx4
integer(i4), automatic :: idx5,idx6,idx7

if(n<=0) then
return
else if(n==1) then
z0 = xre(0)
z1 = z0*z0
z2 = xim(0)
z3 = z2*z2
z4 = sqrt(z1+z3)
zx = vn(0)
z5 = atan(z2/z0)
z6 = z4**zx
z7 = zx*z5
z8 = z6*cos(z7)
cpowr(0) = z8
z9 = z6*sin(z7)
cpowi(0) = z9
return
else if(n>1 .and. n<=4) then
!$omp simd linear(i:1)
do i=0, n-1
xmm0.v(i) = xre(i)
xmm1.v(i) = xmm0.v(i)*xmm0.v(i)
xmm2.v(i) = xim(i)
xmm3.v(i) = xmm2.v(i)*xmm2.v(i)
xmm4.v(i) = sqrt(xmm1.v(i)+xmm3.v(i))
xmm9.v(i) = vn(i)
xmm5.v(i) = atan(xmm2.v(i)/xmm0.v(i))
xmm6.v(i) = xmm4.v(i)**xmm9.v(i)
xmm7.v(i) = xmm9.v(i)*xmm5.v(i)
xmm8.v(i) = xmm6.v(i)*cos(xmm7.v(i))
cpowr(i) = xmm8.v(i)
xmm9.v(i) = xmm6.v(i)*sin(xmm7.v(i))
cpowi(i) = xmm9.v(i)
end do
return
else if(n>4 .and. n<=8) then
!$omp simd linear(i:1)
do i=0, n-1
ymm0.v(i) = xre(i)
ymm1.v(i) = ymm0.v(i)*ymm0.v(i)
ymm2.v(i) = xim(i)
ymm3.v(i) = ymm2.v(i)*ymm2.v(i)
ymm4.v(i) = sqrt(ymm1.v(i)+ymm3.v(i))
ymm9.v(i) = vn(i)
ymm5.v(i) = atan(ymm2.v(i)/ymm0.v(i))
ymm6.v(i) = ymm4.v(i)**ymm9.v(i)
ymm7.v(i) = ymm9.v(i)*ymm5.v(i)
ymm8.v(i) = ymm6.v(i)*cos(ymm7.v(i))
cpowr(i) = ymm8.v(i)
ymm9.v(i) = ymm6.v(i)*sin(ymm7.v(i))
cpowi(i) = ymm9.v(i)
end do
return
else if(n>8 .and. n<=16) then
!$omp simd linear(i:1)
do i=0, n-1
zmm0.v(i) = xre(i)
zmm1.v(i) = zmm0.v(i)*zmm0.v(i)
zmm2.v(i) = xim(i)
zmm3.v(i) = zmm2.v(i)*zmm2.v(i)
zmm4.v(i) = sqrt(zmm1.v(i)+zmm3.v(i))
zmm9.v(i) = vn(i)
zmm5.v(i) = atan(zmm2.v(i)/zmm0.v(i))
zmm6.v(i) = zmm4.v(i)**zmm9.v(i)
zmm7.v(i) = zmm9.v(i)*zmm5.v(i)
zmm8.v(i) = zmm6.v(i)*cos(zmm7.v(i))
cpowr(i) = zmm8.v(i)
zmm9.v(i) = zmm6.v(i)*sin(zmm7.v(i))
cpowi(i) = zmm9.v(i)
end do
return
else if(n>16 .and. n<=32) then
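! One full 16-wide ZMM block per pass; the tail (n mod 16 elements)
! is handled by the scalar remainder loop below.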
do i = 0, n-16, 16
!$omp simd aligned(xre,xim,vn,cpowr,cpowi:64) linear(ii:1)
do ii = 0, 15
zmm0.v(ii) = xre(i+ii)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(i+ii)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(i+ii)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(i+ii) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(i+ii) = zmm9.v(ii)
end do
end do
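! Scalar remainder: at most 15 elements left after the 16-wide blocks.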
#if defined(__ICC) || defined(__INTEL_COMPILER)
!DIR$ LOOP COUNT MAX=16, MIN=1, AVG=8
#endif
do j = i, n-1
z0 = xre(j)
z1 = z0*z0
z2 = xim(j)
z3 = z2*z2
z4 = sqrt(z1+z3)
zx = vn(j)
z5 = atan(z2/z0)
z6 = z4**zx
z7 = zx*z5
z8 = z6*cos(z7)
cpowr(j) = z8
z9 = z6*sin(z7)
cpowi(j) = z9
end do
return
else if(n>32) then
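! Main path: each pass handles 8 consecutive ZMM blocks (8*ZMM_LEN
! elements, i.e. 128 for 16-lane ZMM); mm_prefetch requests the next
! window ahead of use.  Anything short of a full window falls through
! to the scalar remainder loop below.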
do i = 0, n-ZMM_LEN*8, ZMM_LEN*8
call mm_prefetch(xre(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
call mm_prefetch(xim(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
call mm_prefetch(vn(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
#if defined(__ICC) || defined(__INTEL_COMPILER)
!dir$ assume_aligned xre:64
!dir$ assume_aligned xim:64
!dir$ assume_aligned vn:64
!dir$ assume_aligned cpowr:64
!dir$ assume_aligned cpowi:64
#endif
!$omp simd aligned(xre,xim,vn,cpowr,cpowi:64) linear(ii:1)
do ii = 0, ZMM_LEN-1
zmm0.v(ii) = xre(i+0+ii)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(i+0+ii)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(i+0+ii)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(i+0+ii) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(i+0+ii) = zmm9.v(ii)
idx1 = i+1*ZMM_LEN+ii
zmm0.v(ii) = xre(idx1)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx1)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx1)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx1) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx1) = zmm9.v(ii)
idx2 = i+2*ZMM_LEN+ii
zmm0.v(ii) = xre(idx2)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx2)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx2)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx2) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx2) = zmm9.v(ii)
idx3 = i+3*ZMM_LEN+ii
zmm0.v(ii) = xre(idx3)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx3)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx3)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx3) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx3) = zmm9.v(ii)
idx4 = i+4*ZMM_LEN+ii
zmm0.v(ii) = xre(idx4)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx4)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx4)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx4) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx4) = zmm9.v(ii)
idx5 = i+5*ZMM_LEN+ii
zmm0.v(ii) = xre(idx5)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx5)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx5)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx5) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx5) = zmm9.v(ii)
idx6 = i+6*ZMM_LEN+ii
zmm0.v(ii) = xre(idx6)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx6)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx6)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx6) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx6) = zmm9.v(ii)
idx7 = i+7*ZMM_LEN+ii
zmm0.v(ii) = xre(idx7)
zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
zmm2.v(ii) = xim(idx7)
zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
zmm9.v(ii) = vn(idx7)
zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
cpowr(idx7) = zmm8.v(ii)
zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
cpowi(idx7) = zmm9.v(ii)
end do
end do
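! Scalar remainder after the unrolled windows (fewer than 8*ZMM_LEN elements).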
#if defined(__ICC) || defined(__INTEL_COMPILER)
!DIR$ LOOP COUNT MAX=127, MIN=1, AVG=64
#endif
do j = i, n-1
z0 = xre(j)
z1 = z0*z0
z2 = xim(j)
z3 = z2*z2
z4 = sqrt(z1+z3)
zx = vn(j)
z5 = atan(z2/z0)
z6 = z4**zx
z7 = zx*z5
z8 = z6*cos(z7)
cpowr(j) = z8
z9 = z6*sin(z7)
cpowi(j) = z9
end do
return
end if
end subroutine cpowv_v512_8x16_ps
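
For reference, a caller might look like the sketch below; the module name cpow_vec_zmm16r4 and the sp/i4 kind values are assumptions (they are defined elsewhere in this file's module), and the 0-based allocations match the kernel's indexing.

program demo_cpowv
   use cpow_vec_zmm16r4, only : cpowv_v512_8x16_ps    ! assumed module name
   implicit none
   integer, parameter :: sp = kind(1.0), i4 = kind(1) ! assumed kind values
   integer(kind=i4), parameter :: n = 1000
   real(kind=sp), allocatable, dimension(:) :: xre, xim, vn, cpr, cpi
   allocate(xre(0:n-1), xim(0:n-1), vn(0:n-1), cpr(0:n-1), cpi(0:n-1))
   call random_number(xre)
   xre = xre + 0.5_sp          ! keep Re(z) > 0, the domain atan(im/re) assumes
   call random_number(xim)
   vn  = 2.0_sp                ! square every element
   call cpowv_v512_8x16_ps(xre, xim, vn, cpr, cpi, n)
   print *, 'z(0)**2 = (', cpr(0), ',', cpi(0), ')'
end program demo_cpowv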




