From 2e9c5368a5a77e697cef3365571ec6e39e0fd14e Mon Sep 17 00:00:00 2001
From: Bernard Gingold
Date: Sat, 2 Nov 2024 08:39:37 +0100
Subject: [PATCH] Added `cpowv_v512_8x16_ps` kernel.

---
 Mathematics/GMS_cpow_vec_zmm16r4.f90 | 328 +++++++++++++++++++++++++++
 1 file changed, 328 insertions(+)

diff --git a/Mathematics/GMS_cpow_vec_zmm16r4.f90 b/Mathematics/GMS_cpow_vec_zmm16r4.f90
index f66e2fa5..75c33019 100644
--- a/Mathematics/GMS_cpow_vec_zmm16r4.f90
+++ b/Mathematics/GMS_cpow_vec_zmm16r4.f90
@@ -1224,6 +1224,334 @@ subroutine cpowv_v512_16x16_ps(xre,xim,vn,cpowr,cpowi,n)
                 return
              end if
 end subroutine cpowv_v512_16x16_ps
+
+
+subroutine cpowv_v512_8x16_ps(xre,xim,vn,cpowr,cpowi,n)
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+     !DIR$ ATTRIBUTES CODE_ALIGN : 32 :: cpowv_v512_8x16_ps
+     !DIR$ OPTIMIZE : 3
+     !DIR$ ATTRIBUTES OPTIMIZATION_PARAMETER: TARGET_ARCH=skylake_avx512 :: cpowv_v512_8x16_ps
+#endif
+     real(kind=sp), allocatable, dimension(:), intent(in) :: xre
+     real(kind=sp), allocatable, dimension(:), intent(in) :: xim
+     real(kind=sp), allocatable, dimension(:), intent(in) :: vn
+     real(kind=sp), allocatable, dimension(:), intent(inout) :: cpowr
+     real(kind=sp), allocatable, dimension(:), intent(inout) :: cpowi
+     integer(kind=i4), intent(in) :: n
+
+     type(ZMM16r4_t), automatic :: zmm0
+     type(ZMM16r4_t), automatic :: zmm1
+     type(ZMM16r4_t), automatic :: zmm2
+     type(ZMM16r4_t), automatic :: zmm3
+     type(ZMM16r4_t), automatic :: zmm4
+     type(ZMM16r4_t), automatic :: zmm5
+     type(ZMM16r4_t), automatic :: zmm6
+     type(ZMM16r4_t), automatic :: zmm7
+     type(ZMM16r4_t), automatic :: zmm8
+     type(ZMM16r4_t), automatic :: zmm9
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm0
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm1
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm2
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm3
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm4
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm5
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm6
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm7
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm8
+     !DIR$ ATTRIBUTES ALIGN : 64 :: zmm9
+     type(XMM4r4_t), automatic :: xmm0
+     type(XMM4r4_t), automatic :: xmm1
+     type(XMM4r4_t), automatic :: xmm2
+     type(XMM4r4_t), automatic :: xmm3
+     type(XMM4r4_t), automatic :: xmm4
+     type(XMM4r4_t), automatic :: xmm5
+     type(XMM4r4_t), automatic :: xmm6
+     type(XMM4r4_t), automatic :: xmm7
+     type(XMM4r4_t), automatic :: xmm8
+     type(XMM4r4_t), automatic :: xmm9
+     type(YMM8r4_t), automatic :: ymm0
+     type(YMM8r4_t), automatic :: ymm1
+     type(YMM8r4_t), automatic :: ymm2
+     type(YMM8r4_t), automatic :: ymm3
+     type(YMM8r4_t), automatic :: ymm4
+     type(YMM8r4_t), automatic :: ymm5
+     type(YMM8r4_t), automatic :: ymm6
+     type(YMM8r4_t), automatic :: ymm7
+     type(YMM8r4_t), automatic :: ymm8
+     type(YMM8r4_t), automatic :: ymm9
+     real(sp), automatic :: z0
+     real(sp), automatic :: z1
+     real(sp), automatic :: z2
+     real(sp), automatic :: z3
+     real(sp), automatic :: z4
+     real(sp), automatic :: z5
+     real(sp), automatic :: z6
+     real(sp), automatic :: z7
+     real(sp), automatic :: z8
+     real(sp), automatic :: z9
+     real(sp), automatic :: zx
+     integer(i4), automatic :: i,ii,j
+     integer(i4), automatic :: idx1,idx2,idx3,idx4
+     integer(i4), automatic :: idx5,idx6,idx7
+
+     if(n<=0) then
+        return
+     else if(n==1) then
+        z0 = xre(0)
+        z1 = z0*z0
+        z2 = xim(0)
+        z3 = z2*z2
+        z4 = sqrt(z1+z3)
+        zx = vn(0)
+        z5 = atan(z2/z0)
+        z6 = z4**zx
+        z7 = zx*z5
+        z8 = z6*cos(z7)
+        cpowr(0) = z8
+        z9 = z6*sin(z7)
+        cpowi(0) = z9
+        return
+     else if(n>1 .and. n<=4) then
+!$omp simd linear(i:1)
+        do i=0, 3
+           xmm0.v(i) = xre(i)
+           xmm1.v(i) = xmm0.v(i)*xmm0.v(i)
+           xmm2.v(i) = xim(i)
+           xmm3.v(i) = xmm2.v(i)*xmm2.v(i)
+           xmm4.v(i) = sqrt(xmm1.v(i)+xmm3.v(i))
+           xmm9.v(i) = vn(i)
+           xmm5.v(i) = atan(xmm2.v(i)/xmm0.v(i))
+           xmm6.v(i) = xmm4.v(i)**xmm9.v(i)
+           xmm7.v(i) = xmm9.v(i)*xmm5.v(i)
+           xmm8.v(i) = xmm6.v(i)*cos(xmm7.v(i))
+           cpowr(i) = xmm8.v(i)
+           xmm9.v(i) = xmm6.v(i)*sin(xmm7.v(i))
+           cpowi(i) = xmm9.v(i)
+        end do
+        return
+     else if(n>4 .and. n<=8) then
+!$omp simd linear(i:1)
+        do i=0, 7
+           ymm0.v(i) = xre(i)
+           ymm1.v(i) = ymm0.v(i)*ymm0.v(i)
+           ymm2.v(i) = xim(i)
+           ymm3.v(i) = ymm2.v(i)*ymm2.v(i)
+           ymm4.v(i) = sqrt(ymm1.v(i)+ymm3.v(i))
+           ymm9.v(i) = vn(i)
+           ymm5.v(i) = atan(ymm2.v(i)/ymm0.v(i))
+           ymm6.v(i) = ymm4.v(i)**ymm9.v(i)
+           ymm7.v(i) = ymm9.v(i)*ymm5.v(i)
+           ymm8.v(i) = ymm6.v(i)*cos(ymm7.v(i))
+           cpowr(i) = ymm8.v(i)
+           ymm9.v(i) = ymm6.v(i)*sin(ymm7.v(i))
+           cpowi(i) = ymm9.v(i)
+        end do
+        return
+     else if(n>8 .and. n<=16) then
+!$omp simd linear(i:1)
+        do i=0, 15
+           zmm0.v(i) = xre(i)
+           zmm1.v(i) = zmm0.v(i)*zmm0.v(i)
+           zmm2.v(i) = xim(i)
+           zmm3.v(i) = zmm2.v(i)*zmm2.v(i)
+           zmm4.v(i) = sqrt(zmm1.v(i)+zmm3.v(i))
+           zmm9.v(i) = vn(i)
+           zmm5.v(i) = atan(zmm2.v(i)/zmm0.v(i))
+           zmm6.v(i) = zmm4.v(i)**zmm9.v(i)
+           zmm7.v(i) = zmm9.v(i)*zmm5.v(i)
+           zmm8.v(i) = zmm6.v(i)*cos(zmm7.v(i))
+           cpowr(i) = zmm8.v(i)
+           zmm9.v(i) = zmm6.v(i)*sin(zmm7.v(i))
+           cpowi(i) = zmm9.v(i)
+        end do
+        return
+     else if(n>16 .and. n<=32) then
+        do i = 0,iand(n-1,inot(15)),16
+!$omp simd aligned(xim:64,xre,vn,cpowr,cpowi) linear(ii:1)
+           do ii = 0, 15
+              zmm0.v(ii) = xre(i+ii)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(i+ii)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(i+ii)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(i+ii) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(i+ii) = zmm9.v(ii)
+           end do
+        end do
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+        !DIR$ LOOP COUNT MAX=16, MIN=1, AVG=8
+#endif
+        do j = i, n-1
+           z0 = xre(j)
+           z1 = z0*z0
+           z2 = xim(j)
+           z3 = z2*z2
+           z4 = sqrt(z1+z3)
+           zx = vn(j)
+           z5 = atan(z2/z0)
+           z6 = z4**zx
+           z7 = zx*z5
+           z8 = z6*cos(z7)
+           cpowr(j) = z8
+           z9 = z6*sin(z7)
+           cpowi(j) = z9
+        end do
+        return
+     else if(n>32) then
+        do i=0, iand(n-1,inot(ZMM_LEN-1)), ZMM_LEN*8
+           call mm_prefetch(xre(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
+           call mm_prefetch(xim(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
+           call mm_prefetch(vn(i+8*ZMM_LEN),FOR_K_PREFETCH_T1)
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+           !dir$ assume_aligned xre:64
+           !dir$ assume_aligned xim:64
+           !dir$ assume_aligned vn:64
+           !dir$ assume_aligned cpowr:64
+           !dir$ assume_aligned cpowi:64
+#endif
+!$omp simd aligned(xim:64,xre,vn,cpowr,cpowi) linear(ii:1)
+           do ii = 0, ZMM_LEN-1
+              zmm0.v(ii) = xre(i+0+ii)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(i+0+ii)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(i+0+ii)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(i+0+ii) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(i+0+ii) = zmm9.v(ii)
+              idx1 = i+1*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx1)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx1)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx1)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx1) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx1) = zmm9.v(ii)
+              idx2 = i+2*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx2)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx2)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx2)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx2) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx2) = zmm9.v(ii)
+              idx3 = i+3*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx3)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx3)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx3)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx3) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx3) = zmm9.v(ii)
+              idx4 = i+4*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx4)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx4)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx4)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx4) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx4) = zmm9.v(ii)
+              idx5 = i+5*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx5)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx5)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx5)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx5) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx5) = zmm9.v(ii)
+              idx6 = i+6*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx6)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx6)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx6)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx6) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx6) = zmm9.v(ii)
+              idx7 = i+7*ZMM_LEN+ii
+              zmm0.v(ii) = xre(idx7)
+              zmm1.v(ii) = zmm0.v(ii)*zmm0.v(ii)
+              zmm2.v(ii) = xim(idx7)
+              zmm3.v(ii) = zmm2.v(ii)*zmm2.v(ii)
+              zmm4.v(ii) = sqrt(zmm1.v(ii)+zmm3.v(ii))
+              zmm9.v(ii) = vn(idx7)
+              zmm5.v(ii) = atan(zmm2.v(ii)/zmm0.v(ii))
+              zmm6.v(ii) = zmm4.v(ii)**zmm9.v(ii)
+              zmm7.v(ii) = zmm9.v(ii)*zmm5.v(ii)
+              zmm8.v(ii) = zmm6.v(ii)*cos(zmm7.v(ii))
+              cpowr(idx7) = zmm8.v(ii)
+              zmm9.v(ii) = zmm6.v(ii)*sin(zmm7.v(ii))
+              cpowi(idx7) = zmm9.v(ii)
+           end do
+        end do
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+        !DIR$ LOOP COUNT MAX=16, MIN=1, AVG=8
+#endif
+        do j = i, n-1
+           z0 = xre(j)
+           z1 = z0*z0
+           z2 = xim(j)
+           z3 = z2*z2
+           z4 = sqrt(z1+z3)
+           zx = vn(j)
+           z5 = atan(z2/z0)
+           z6 = z4**zx
+           z7 = zx*z5
+           z8 = z6*cos(z7)
+           cpowr(j) = z8
+           z9 = z6*sin(z7)
+           cpowi(j) = z9
+        end do
+        return
+     end if
+end subroutine cpowv_v512_8x16_ps
+
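
For reference, below is a minimal self-check driver sketch for the new kernel; it is a sketch under stated assumptions, not part of the patch. The kind module mod_kinds (providing i4/sp), the containing module name cpow_vec_zmm16r4, and ZMM_LEN = 16 are assumptions about the repository layout. The arrays are allocated with the 0-based lower bound the kernel indexes from, and n is chosen as a multiple of 8*ZMM_LEN because the unrolled main path consumes full 128-element blocks. The check compares the kernel's polar-form result r**vn * (cos(vn*theta) + i*sin(vn*theta)) against Fortran's intrinsic complex exponentiation, biasing xre positive so that atan(xim/xre) coincides with the principal argument.

program check_cpowv_8x16
   use mod_kinds,        only : i4, sp                ! assumed: repo kind module providing i4, sp
   use cpow_vec_zmm16r4, only : cpowv_v512_8x16_ps    ! assumed module name for GMS_cpow_vec_zmm16r4.f90
   implicit none
   integer(kind=i4), parameter :: n = 1024            ! multiple of 8*ZMM_LEN (assumed ZMM_LEN=16)
   real(kind=sp), allocatable, dimension(:) :: xre, xim, vn, cpr, cpi
   complex(kind=sp) :: ref
   integer(kind=i4) :: k
   ! 0-based allocation to match the kernel's 0-based indexing
   allocate(xre(0:n-1), xim(0:n-1), vn(0:n-1), cpr(0:n-1), cpi(0:n-1))
   call random_number(xre)
   call random_number(xim)
   call random_number(vn)
   xre = xre + 0.5_sp                                 ! keep Re(z) > 0 so atan(xim/xre) is the principal argument
   call cpowv_v512_8x16_ps(xre, xim, vn, cpr, cpi, n)
   do k = 0, n-1
      ref = cmplx(xre(k), xim(k), kind=sp)**vn(k)     ! intrinsic complex power as the reference
      if (abs(cpr(k)-real(ref)) > 1.0e-4_sp .or. abs(cpi(k)-aimag(ref)) > 1.0e-4_sp) then
         print *, 'mismatch at index ', k
      end if
   end do
   print *, 'check finished'
end program check_cpowv_8x16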