! Origin: routine added to Mathematics/GMS_cdiv_vec_zmm16r4.f90 by commit
! 24c4ff27da9fec8228d5d81444d4c58dff99ee7f (Bernard Gingold, Sun, 3 Nov 2024 15:47:53 +0100),
! subject "Added `cdivv_kernel_v512_16x16_ps `".  The text below is a reviewed and
! corrected version of that routine.
!
! cdivv_kernel_v512_16x16_ps
!
! Element-wise complex division on split (real / imaginary) single-precision arrays:
!
!     zre(k) + i*zim(k) = (xre(k) + i*xim(k)) / (yre(k) + i*yim(k)),   k = 0 .. n-1
!
! evaluated as
!
!     den    = yre(k)*yre(k) + yim(k)*yim(k)
!     zre(k) = (xre(k)*yre(k) + xim(k)*yim(k)) / den
!     zim(k) = (xim(k)*yre(k) - xre(k)*yim(k)) / den
!
! Arguments
!   xre, xim : real and imaginary parts of the dividend (read only).
!   yre, yim : real and imaginary parts of the divisor  (read only).
!   zre, zim : real and imaginary parts of the quotient.  They must already be
!              allocated by the caller.  They are declared intent(inout) on purpose:
!              an allocatable intent(out) dummy is deallocated on entry, which would
!              throw away the caller's buffer before the first store.
!   n        : number of elements to process; nothing is done when n <= 0.
!
! Preconditions (not checked here)
!   * Elements 0 .. n-1 of all six arrays are accessed, so the actual arguments have
!     to be allocated with lower bound 0 and hold at least n elements.
!   * The ASSUME_ALIGNED directives and the OpenMP aligned clauses assert that all six
!     arrays are 64-byte aligned.
!   * There is no guard against a zero divisor (yre(k) = yim(k) = 0).
!   * NOTE(review): ZMM_LEN is taken to be the lane count of ZMM16r4_t (v(0:15));
!     sp, i4, ZMM16r4_t, ZMM_LEN, mm_prefetch and FOR_K_PREFETCH_T1 come from outside
!     this routine -- confirm against the enclosing module.
!
! Work split
!   1. Blocks of n_unroll*ZMM_LEN elements (16 vectors of ZMM_LEN lanes each); the
!      quotient is formed by multiplying with the reciprocal of den.
!   2. Remaining whole vectors of ZMM_LEN elements, using direct division.
!   3. Scalar tail for the last (fewer than ZMM_LEN) elements, using direct division.
!   Stage 1 and stages 2/3 may therefore differ in the last bit for identical inputs.
subroutine cdivv_kernel_v512_16x16_ps(xre,xim,yre,yim,zre,zim,n)
#if defined(__ICC) || defined(__INTEL_COMPILER)
      !DIR$ ATTRIBUTES CODE_ALIGN : 32 :: cdivv_kernel_v512_16x16_ps
      !DIR$ OPTIMIZE : 3
      !DIR$ ATTRIBUTES OPTIMIZATION_PARAMETER: TARGET_ARCH=skylake_avx512 :: cdivv_kernel_v512_16x16_ps
#endif
      real(kind=sp), allocatable, dimension(:), intent(in)    :: xre
      real(kind=sp), allocatable, dimension(:), intent(in)    :: xim
      real(kind=sp), allocatable, dimension(:), intent(in)    :: yre
      real(kind=sp), allocatable, dimension(:), intent(in)    :: yim
      real(kind=sp), allocatable, dimension(:), intent(inout) :: zre
      real(kind=sp), allocatable, dimension(:), intent(inout) :: zim
      integer(i4),                              intent(in)    :: n

      ! Number of ZMM_LEN-wide vectors handled per stage-1 block.
      integer(i4), parameter :: n_unroll = 16

      ! Vector-shaped scratch: x (re, im), y (re, im), numerators, |y|**2, 1/|y|**2.
      type(ZMM16r4_t), automatic :: zmm0
      type(ZMM16r4_t), automatic :: zmm1
      type(ZMM16r4_t), automatic :: zmm2
      type(ZMM16r4_t), automatic :: zmm3
      type(ZMM16r4_t), automatic :: zmm4
      type(ZMM16r4_t), automatic :: zmm5
      type(ZMM16r4_t), automatic :: zmm6
      type(ZMM16r4_t), automatic :: zmm7
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm0
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm1
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm2
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm3
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm4
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm5
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm6
      !DIR$ ATTRIBUTES ALIGN : 64 :: zmm7
      real(sp),    automatic :: xr
      real(sp),    automatic :: xi
      real(sp),    automatic :: yr
      real(sp),    automatic :: yi
      real(sp),    automatic :: den
      integer(i4), automatic :: i,ii,j,k
      integer(i4), automatic :: base,blk

      if(n<=0) return

      blk = n_unroll*ZMM_LEN

#if defined(__ICC) || defined(__INTEL_COMPILER)
      !dir$ assume_aligned xre:64
      !dir$ assume_aligned xim:64
      !dir$ assume_aligned yre:64
      !dir$ assume_aligned yim:64
      !dir$ assume_aligned zre:64
      !dir$ assume_aligned zim:64
#endif

      ! Stage 1: full blocks.  The bound n-blk guarantees i+blk-1 <= n-1; the loop
      ! runs zero times (leaving i = 0) when n < blk.
      do i = 0, n-blk, blk
         ! Prefetch the start of the next block only if that element exists.
         if(i+blk<n) then
            call mm_prefetch(xre(i+blk),FOR_K_PREFETCH_T1)
            call mm_prefetch(xim(i+blk),FOR_K_PREFETCH_T1)
            call mm_prefetch(yre(i+blk),FOR_K_PREFETCH_T1)
            call mm_prefetch(yim(i+blk),FOR_K_PREFETCH_T1)
         end if
         ! Constant trip count, so the compiler is free to unroll this loop fully.
         do k = 0, n_unroll-1
            base = i+k*ZMM_LEN
!$omp simd aligned(xre,xim,yre,yim,zre,zim:64) linear(ii:1)
            do ii = 0, ZMM_LEN-1
               zmm0%v(ii)   = xre(base+ii)                                     ! xr
               zmm1%v(ii)   = yre(base+ii)                                     ! yr
               zmm2%v(ii)   = xim(base+ii)                                     ! xi
               zmm3%v(ii)   = yim(base+ii)                                     ! yi
               zmm4%v(ii)   = (zmm0%v(ii)*zmm1%v(ii))+(zmm2%v(ii)*zmm3%v(ii))  ! xr*yr+xi*yi
               zmm5%v(ii)   = (zmm2%v(ii)*zmm1%v(ii))-(zmm0%v(ii)*zmm3%v(ii))  ! xi*yr-xr*yi
               zmm6%v(ii)   = (zmm1%v(ii)*zmm1%v(ii))+(zmm3%v(ii)*zmm3%v(ii))  ! den
               zmm7%v(ii)   = 1.0_sp/zmm6%v(ii)                                ! one divide per lane
               zre(base+ii) = zmm4%v(ii)*zmm7%v(ii)
               zim(base+ii) = zmm5%v(ii)*zmm7%v(ii)
            end do
         end do
      end do

      ! Stage 2: whole vectors left after stage 1.  On exit from the loop above i is
      ! the number of elements already processed (a multiple of blk).
      do j = i, n-ZMM_LEN, ZMM_LEN
!$omp simd aligned(xre,xim,yre,yim,zre,zim:64) linear(ii:1)
         do ii = 0, ZMM_LEN-1
            zmm0%v(ii) = xre(j+ii)                                           ! xr
            zmm1%v(ii) = yre(j+ii)                                           ! yr
            zmm2%v(ii) = xim(j+ii)                                           ! xi
            zmm3%v(ii) = yim(j+ii)                                           ! yi
            zmm4%v(ii) = (zmm0%v(ii)*zmm1%v(ii))+(zmm2%v(ii)*zmm3%v(ii))     ! xr*yr+xi*yi
            zmm5%v(ii) = (zmm2%v(ii)*zmm1%v(ii))-(zmm0%v(ii)*zmm3%v(ii))     ! xi*yr-xr*yi
            zmm6%v(ii) = (zmm1%v(ii)*zmm1%v(ii))+(zmm3%v(ii)*zmm3%v(ii))     ! den
            zre(j+ii)  = zmm4%v(ii)/zmm6%v(ii)
            zim(j+ii)  = zmm5%v(ii)/zmm6%v(ii)
         end do
      end do

      ! Stage 3: scalar tail, at most ZMM_LEN-1 elements starting at the first index
      ! not covered by stages 1 and 2 (the value j holds after the loop above).
#if defined(__ICC) || defined(__INTEL_COMPILER)
      !DIR$ LOOP COUNT MAX=15, AVG=8
#endif
      do k = j, n-1
         xr     = xre(k)
         yr     = yre(k)
         xi     = xim(k)
         yi     = yim(k)
         den    = (yr*yr)+(yi*yi)
         zre(k) = ((xr*yr)+(xi*yi))/den
         zim(k) = ((xi*yr)-(xr*yi))/den
      end do
end subroutine cdivv_kernel_v512_16x16_ps