Skip to content

Commit

Permalink
Added cabsv_kernel_v512_4x16_ps .
Browse files Browse the repository at this point in the history
  • Loading branch information
bgin authored Nov 2, 2024
1 parent 436c6dd commit f8c981a
Showing 1 changed file with 145 additions and 1 deletion.
146 changes: 145 additions & 1 deletion Mathematics/GMS_cabs_vec_zmm16r4.f90
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,151 @@ subroutine cabsv_kernel_v512_8x16_ps(xre,xim,cabs,n)
end subroutine cabsv_kernel_v512_8x16_ps



subroutine cabsv_kernel_v512_4x16_ps(xre,xim,cabs,n)
      ! Element-wise magnitude of a complex vector stored as split
      ! real/imaginary parts:
      !
      !     cabs(k) = sqrt(xre(k)**2 + xim(k)**2),   k = 0 .. n-1
      !
      ! Arguments
      !   xre  (in)    : real parts,      indexed from 0
      !   xim  (in)    : imaginary parts, indexed from 0
      !   cabs (inout) : result buffer,   indexed from 0; must already be
      !                  allocated by the caller with at least n elements.
      !                  It is declared intent(inout) on purpose: an
      !                  allocatable intent(out) dummy is deallocated on
      !                  entry, which would discard the caller's buffer.
      !   n    (in)    : number of elements to process; n <= 0 is a no-op.
      !
      ! Preconditions (asserted to the compiler, not checked at run time):
      !   * all three arrays are 64-byte aligned;
      !   * ZMM_LEN equals the extent of the %v component of ZMM16r4_t
      !     (presumably 16 -- confirm against the type definition).
      !
      ! Work is split into three stages that together touch exactly the
      ! elements 0 .. n-1:
      !   1. blocks of 4*ZMM_LEN elements, unrolled four times;
      !   2. whole vectors of ZMM_LEN elements;
      !   3. a scalar tail of fewer than ZMM_LEN elements.
#if defined(__ICC) || defined(__INTEL_COMPILER)
      !DIR$ ATTRIBUTES CODE_ALIGN : 32 :: cabsv_kernel_v512_4x16_ps
      !DIR$ OPTIMIZE : 3
      !DIR$ ATTRIBUTES OPTIMIZATION_PARAMETER: TARGET_ARCH=skylake_avx512 :: cabsv_kernel_v512_4x16_ps
#endif
      real(kind=sp), allocatable, dimension(:), intent(in)    :: xre
      real(kind=sp), allocatable, dimension(:), intent(in)    :: xim
      real(kind=sp), allocatable, dimension(:), intent(inout) :: cabs
      integer(i4),                              intent(in)    :: n

      type(ZMM16r4_t), automatic :: zmm0
      type(ZMM16r4_t), automatic :: zmm1
      type(ZMM16r4_t), automatic :: zmm2
      type(ZMM16r4_t), automatic :: zmm3
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm0
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm1
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm2
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm3
      real(sp),    automatic :: xr
      real(sp),    automatic :: xi
      real(sp),    automatic :: re2
      real(sp),    automatic :: im2
      integer(i4), automatic :: i,ii,j
      integer(i4), automatic :: idx1,idx2,idx3
      integer(i4), automatic :: blk     ! elements per 4x-unrolled block
      integer(i4), automatic :: n_blk   ! elements covered by full blocks
      integer(i4), automatic :: n_vec   ! elements covered by full vectors

      if(n<=0) return

      blk   = 4*ZMM_LEN
      n_blk = (n/blk)*blk
      n_vec = (n/ZMM_LEN)*ZMM_LEN

#if defined(__ICC) || defined(__INTEL_COMPILER)
      !dir$ assume_aligned xre:64
      !dir$ assume_aligned xim:64
      !dir$ assume_aligned cabs:64
#endif

      ! Stage 1: full blocks of 4*ZMM_LEN elements.  The upper bound
      ! n_blk-blk guarantees that i+blk-1 <= n-1 for every trip, so the
      ! unrolled body never runs past the end of the arrays.
      do i = 0, n_blk-blk, blk
          ! Prefetch the start of the next block only if it exists.
          if(i+blk < n) then
              call mm_prefetch(xre(i+blk),FOR_K_PREFETCH_T1)
              call mm_prefetch(xim(i+blk),FOR_K_PREFETCH_T1)
          end if
          !$omp simd aligned(xre,xim,cabs:64) linear(ii:1) private(idx1,idx2,idx3)
          do ii = 0, ZMM_LEN-1
              zmm0%v(ii)  = xre(i+ii)
              zmm1%v(ii)  = zmm0%v(ii)*zmm0%v(ii)
              zmm2%v(ii)  = xim(i+ii)
              zmm3%v(ii)  = zmm2%v(ii)*zmm2%v(ii)
              cabs(i+ii)  = sqrt(zmm1%v(ii)+zmm3%v(ii))
              idx1        = i+1*ZMM_LEN+ii
              zmm0%v(ii)  = xre(idx1)
              zmm1%v(ii)  = zmm0%v(ii)*zmm0%v(ii)
              zmm2%v(ii)  = xim(idx1)
              zmm3%v(ii)  = zmm2%v(ii)*zmm2%v(ii)
              cabs(idx1)  = sqrt(zmm1%v(ii)+zmm3%v(ii))
              idx2        = i+2*ZMM_LEN+ii
              zmm0%v(ii)  = xre(idx2)
              zmm1%v(ii)  = zmm0%v(ii)*zmm0%v(ii)
              zmm2%v(ii)  = xim(idx2)
              zmm3%v(ii)  = zmm2%v(ii)*zmm2%v(ii)
              cabs(idx2)  = sqrt(zmm1%v(ii)+zmm3%v(ii))
              idx3        = i+3*ZMM_LEN+ii
              zmm0%v(ii)  = xre(idx3)
              zmm1%v(ii)  = zmm0%v(ii)*zmm0%v(ii)
              zmm2%v(ii)  = xim(idx3)
              zmm3%v(ii)  = zmm2%v(ii)*zmm2%v(ii)
              cabs(idx3)  = sqrt(zmm1%v(ii)+zmm3%v(ii))
          end do
      end do

      ! Stage 2: whole vectors left over after the last full block
      ! (at most three of them).
      do i = n_blk, n_vec-ZMM_LEN, ZMM_LEN
          !$omp simd aligned(xre,xim,cabs:64) linear(ii:1)
          do ii = 0, ZMM_LEN-1
              zmm0%v(ii) = xre(i+ii)
              zmm1%v(ii) = zmm0%v(ii)*zmm0%v(ii)
              zmm2%v(ii) = xim(i+ii)
              zmm3%v(ii) = zmm2%v(ii)*zmm2%v(ii)
              cabs(i+ii) = sqrt(zmm1%v(ii)+zmm3%v(ii))
          end do
      end do

      ! Stage 3: scalar tail, fewer than ZMM_LEN elements (possibly none).
#if defined(__ICC) || defined(__INTEL_COMPILER)
      !DIR$ LOOP COUNT MAX=15, MIN=0, AVG=8
#endif
      do j = n_vec, n-1
          xr      = xre(j)
          re2     = xr*xr
          xi      = xim(j)
          im2     = xi*xi
          cabs(j) = sqrt(re2+im2)
      end do
end subroutine cabsv_kernel_v512_4x16_ps



Expand Down

0 comments on commit f8c981a

Please sign in to comment.