From e72fe6cf0a42571ec81941be7269727c6356185a Mon Sep 17 00:00:00 2001 From: Intak Hwang Date: Tue, 18 Feb 2025 06:53:05 +0000 Subject: [PATCH] chore: rename vw to vTw --- internal/asmgen/fft.go | 150 +++++++++++----------- math/poly/asm_fft.go | 250 ++++++++++++++++++------------------- math/poly/asm_fft_amd64.go | 250 ++++++++++++++++++------------------- 3 files changed, 325 insertions(+), 325 deletions(-) diff --git a/internal/asmgen/fft.go b/internal/asmgen/fft.go index 3cfe004..97bce10 100644 --- a/internal/asmgen/fft.go +++ b/internal/asmgen/fft.go @@ -39,23 +39,23 @@ func fftInPlaceAVX2() { VMOVUPD(Mem{Base: coeffs, Index: jt, Scale: 8}, vReal) VMOVUPD(Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}, vImag) - vwReal := YMM() - VMULPD(wReal, vReal, vwReal) - VFNMADD231PD(wImag, vImag, vwReal) + vTwReal := YMM() + VMULPD(wReal, vReal, vTwReal) + VFNMADD231PD(wImag, vImag, vTwReal) - vwImag := YMM() - VMULPD(wImag, vReal, vwImag) - VFMADD231PD(wReal, vImag, vwImag) + vTwImag := YMM() + VMULPD(wImag, vReal, vTwImag) + VFMADD231PD(wReal, vImag, vTwImag) uOutReal := YMM() - VADDPD(vwReal, uReal, uOutReal) + VADDPD(vTwReal, uReal, uOutReal) uOutImag := YMM() - VADDPD(vwImag, uImag, uOutImag) + VADDPD(vTwImag, uImag, uOutImag) vOutReal := YMM() - VSUBPD(vwReal, uReal, vOutReal) + VSUBPD(vTwReal, uReal, vOutReal) vOutImag := YMM() - VSUBPD(vwImag, uImag, vOutImag) + VSUBPD(vTwImag, uImag, vOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(uOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) @@ -113,23 +113,23 @@ func fftInPlaceAVX2() { VMOVUPD(Mem{Base: coeffs, Index: jt, Scale: 8}, vReal) VMOVUPD(Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}, vImag) - vwReal = YMM() - VMULPD(wReal, vReal, vwReal) - VFNMADD231PD(wImag, vImag, vwReal) + vTwReal = YMM() + VMULPD(wReal, vReal, vTwReal) + VFNMADD231PD(wImag, vImag, vTwReal) - vwImag = YMM() - VMULPD(wImag, vReal, vwImag) - VFMADD231PD(wReal, vImag, vwImag) + vTwImag = YMM() + VMULPD(wImag, vReal, vTwImag) + VFMADD231PD(wReal, vImag, vTwImag) uOutReal = YMM() - VADDPD(vwReal, uReal, uOutReal) + VADDPD(vTwReal, uReal, uOutReal) uOutImag = YMM() - VADDPD(vwImag, uImag, uOutImag) + VADDPD(vTwImag, uImag, uOutImag) vOutReal = YMM() - VSUBPD(vwReal, uReal, vOutReal) + VSUBPD(vTwReal, uReal, vOutReal) vOutImag = YMM() - VSUBPD(vwImag, uImag, vOutImag) + VSUBPD(vTwImag, uImag, vOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(uOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) @@ -173,21 +173,21 @@ func fftInPlaceAVX2() { VMOVUPD(Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}, uImag) VMOVUPD(Mem{Base: coeffs, Index: j, Scale: 8, Disp: 48}, vImag) - vwReal = XMM() - VMULPD(wReal, vReal, vwReal) - VFNMADD231PD(wImag, vImag, vwReal) + vTwReal = XMM() + VMULPD(wReal, vReal, vTwReal) + VFNMADD231PD(wImag, vImag, vTwReal) - vwImag = XMM() - VMULPD(wImag, vReal, vwImag) - VFMADD231PD(wReal, vImag, vwImag) + vTwImag = XMM() + VMULPD(wImag, vReal, vTwImag) + VFMADD231PD(wReal, vImag, vTwImag) uOutReal, vOutReal = XMM(), XMM() - VADDPD(vwReal, uReal, uOutReal) - VSUBPD(vwReal, uReal, vOutReal) + VADDPD(vTwReal, uReal, uOutReal) + VSUBPD(vTwReal, uReal, vOutReal) uOutImag, vOutImag = XMM(), XMM() - VADDPD(vwImag, uImag, uOutImag) - VSUBPD(vwImag, uImag, vOutImag) + VADDPD(vTwImag, uImag, uOutImag) + VSUBPD(vTwImag, uImag, vOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(vOutReal, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 16}) @@ -227,10 +227,10 @@ func fftInPlaceAVX2() { VSHUFPD(Imm(0b1111), uRealvReal, uRealvReal, vReal) VSHUFPD(Imm(0b1111), uImagvImag, uImagvImag, vImag) - // vwRealImag: (vwReal0, vwImag0, vwReal1, vwImag1) - vwRealImag := YMM() - VMULPD(wImagReal, vImag, vwRealImag) - VFMADDSUB231PD(wRealImag, vReal, vwRealImag) + // vTwRealImag: (vTwReal0, vTwImag0, vTwReal1, vTwImag1) + vTwRealImag := YMM() + VMULPD(wImagReal, vImag, vTwRealImag) + VFMADDSUB231PD(wRealImag, vReal, vTwRealImag) // uReal: (uReal0, uReal0, uReal1, uReal1) // uImag: (uImag0, uImag0, uImag1, uImag1) @@ -238,18 +238,18 @@ func fftInPlaceAVX2() { VSHUFPD(Imm(0b0000), uRealvReal, uRealvReal, uReal) VSHUFPD(Imm(0b0000), uImagvImag, uImagvImag, uImag) - // vwReal: (vwReal0, vwReal0, vwReal1, vwReal1) - // vwImag: (vwImag0, vwImag0, vwImag1, vwImag1) - vwReal, vwImag = YMM(), YMM() - VSHUFPD(Imm(0b0000), vwRealImag, vwRealImag, vwReal) - VSHUFPD(Imm(0b1111), vwRealImag, vwRealImag, vwImag) + // vTwReal: (vTwReal0, vTwReal0, vTwReal1, vTwReal1) + // vTwImag: (vTwImag0, vTwImag0, vTwImag1, vTwImag1) + vTwReal, vTwImag = YMM(), YMM() + VSHUFPD(Imm(0b0000), vTwRealImag, vTwRealImag, vTwReal) + VSHUFPD(Imm(0b1111), vTwRealImag, vTwRealImag, vTwImag) - VSUBPD(vwReal, zero, vwReal) - VSUBPD(vwImag, zero, vwImag) + VSUBPD(vTwReal, zero, vTwReal) + VSUBPD(vTwImag, zero, vTwImag) uOut, vOut := YMM(), YMM() - VADDSUBPD(vwReal, uReal, uOut) - VADDSUBPD(vwImag, uImag, vOut) + VADDSUBPD(vTwReal, uReal, uOut) + VADDSUBPD(vTwImag, uImag, vOut) VMOVUPD(uOut, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(vOut, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) @@ -314,14 +314,14 @@ func ifftInPlaceAVX2() { VSHUFPD(Imm(0b0000), vOutRealImag, vOutRealImag, vOutReal) VSHUFPD(Imm(0b1111), vOutRealImag, vOutRealImag, vOutImag) - // vwOutRealImag: (vwOutReal0, vwOutImag0, vwOutReal1, vwOutImag1) - vwOutRealImag := YMM() - VMULPD(wImagReal, vOutImag, vwOutRealImag) - VFMADDSUB231PD(wRealImag, vOutReal, vwOutRealImag) + // vTwOutRealImag: (vTwOutReal0, vTwOutImag0, vTwOutReal1, vTwOutImag1) + vTwOutRealImag := YMM() + VMULPD(wImagReal, vOutImag, vTwOutRealImag) + VFMADDSUB231PD(wRealImag, vOutReal, vTwOutRealImag) uOut, vOut := YMM(), YMM() - VSHUFPD(Imm(0b0000), vwOutRealImag, uOutReal, uOut) - VSHUFPD(Imm(0b1111), vwOutRealImag, uOutImag, vOut) + VSHUFPD(Imm(0b0000), vTwOutRealImag, uOutReal, uOut) + VSHUFPD(Imm(0b1111), vTwOutRealImag, uOutImag, vOut) VMOVUPD(uOut, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(vOut, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) @@ -358,18 +358,18 @@ func ifftInPlaceAVX2() { VSUBPD(vReal, uReal, vOutReal) VSUBPD(vImag, uImag, vOutImag) - vwOutReal := XMM() - VMULPD(wReal, vOutReal, vwOutReal) - VFNMADD231PD(wImag, vOutImag, vwOutReal) + vTwOutReal := XMM() + VMULPD(wReal, vOutReal, vTwOutReal) + VFNMADD231PD(wImag, vOutImag, vTwOutReal) - vwOutImag := XMM() - VMULPD(wImag, vOutReal, vwOutImag) - VFMADD231PD(wReal, vOutImag, vwOutImag) + vTwOutImag := XMM() + VMULPD(wImag, vOutReal, vTwOutImag) + VFMADD231PD(wReal, vOutImag, vTwOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) - VMOVUPD(vwOutReal, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 16}) + VMOVUPD(vTwOutReal, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 16}) VMOVUPD(uOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) - VMOVUPD(vwOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 48}) + VMOVUPD(vTwOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 48}) ADDQ(Imm(8), j) @@ -425,18 +425,18 @@ func ifftInPlaceAVX2() { VSUBPD(vReal, uReal, vOutReal) VSUBPD(vImag, uImag, vOutImag) - vwOutReal = YMM() - VMULPD(wReal, vOutReal, vwOutReal) - VFNMADD231PD(wImag, vOutImag, vwOutReal) + vTwOutReal = YMM() + VMULPD(wReal, vOutReal, vTwOutReal) + VFNMADD231PD(wImag, vOutImag, vTwOutReal) - vwOutImag = YMM() - VMULPD(wImag, vOutReal, vwOutImag) - VFMADD231PD(wReal, vOutImag, vwOutImag) + vTwOutImag = YMM() + VMULPD(wImag, vOutReal, vTwOutImag) + VFMADD231PD(wReal, vOutImag, vTwOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(uOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) - VMOVUPD(vwOutReal, Mem{Base: coeffs, Index: jt, Scale: 8}) - VMOVUPD(vwOutImag, Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}) + VMOVUPD(vTwOutReal, Mem{Base: coeffs, Index: jt, Scale: 8}) + VMOVUPD(vTwOutImag, Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}) ADDQ(Imm(8), j) ADDQ(Imm(8), jt) @@ -490,23 +490,23 @@ func ifftInPlaceAVX2() { VSUBPD(vReal, uReal, vOutReal) VSUBPD(vImag, uImag, vOutImag) - vwOutReal = YMM() - VMULPD(wReal, vOutReal, vwOutReal) - VFNMADD231PD(wImag, vOutImag, vwOutReal) + vTwOutReal = YMM() + VMULPD(wReal, vOutReal, vTwOutReal) + VFNMADD231PD(wImag, vOutImag, vTwOutReal) - vwOutImag = YMM() - VMULPD(wImag, vOutReal, vwOutImag) - VFMADD231PD(wReal, vOutImag, vwOutImag) + vTwOutImag = YMM() + VMULPD(wImag, vOutReal, vTwOutImag) + VFMADD231PD(wReal, vOutImag, vTwOutImag) VMULPD(scale, uOutReal, uOutReal) VMULPD(scale, uOutImag, uOutImag) - VMULPD(scale, vwOutReal, vwOutReal) - VMULPD(scale, vwOutImag, vwOutImag) + VMULPD(scale, vTwOutReal, vTwOutReal) + VMULPD(scale, vTwOutImag, vTwOutImag) VMOVUPD(uOutReal, Mem{Base: coeffs, Index: j, Scale: 8}) VMOVUPD(uOutImag, Mem{Base: coeffs, Index: j, Scale: 8, Disp: 32}) - VMOVUPD(vwOutReal, Mem{Base: coeffs, Index: jt, Scale: 8}) - VMOVUPD(vwOutImag, Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}) + VMOVUPD(vTwOutReal, Mem{Base: coeffs, Index: jt, Scale: 8}) + VMOVUPD(vTwOutImag, Mem{Base: coeffs, Index: jt, Scale: 8, Disp: 32}) ADDQ(Imm(8), j) ADDQ(Imm(8), jt) diff --git a/math/poly/asm_fft.go b/math/poly/asm_fft.go index 8c431e8..4203e6a 100644 --- a/math/poly/asm_fft.go +++ b/math/poly/asm_fft.go @@ -32,35 +32,35 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag2 := coeffs[j+N/2+6] vImag3 := coeffs[j+N/2+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag - vwReal2 := vReal2*wReal - vImag2*wImag - vwReal3 := vReal3*wReal - vImag3*wImag - - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal - vwImag2 := vReal2*wImag + vImag2*wReal - vwImag3 := vReal3*wImag + vImag3*wReal - - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 - uOutReal2 := uReal2 + vwReal2 - uOutReal3 := uReal3 + vwReal3 - - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 - uOutImag2 := uImag2 + vwImag2 - uOutImag3 := uImag3 + vwImag3 - - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 - vOutReal2 := uReal2 - vwReal2 - vOutReal3 := uReal3 - vwReal3 - - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 - vOutImag2 := uImag2 - vwImag2 - vOutImag3 := uImag3 - vwImag3 + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag + vTwReal2 := vReal2*wReal - vImag2*wImag + vTwReal3 := vReal3*wReal - vImag3*wImag + + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal + vTwImag2 := vReal2*wImag + vImag2*wReal + vTwImag3 := vReal3*wImag + vImag3*wReal + + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 + uOutReal2 := uReal2 + vTwReal2 + uOutReal3 := uReal3 + vTwReal3 + + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 + uOutImag2 := uImag2 + vTwImag2 + uOutImag3 := uImag3 + vTwImag3 + + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 + vOutReal2 := uReal2 - vTwReal2 + vOutReal3 := uReal3 - vTwReal3 + + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 + vOutImag2 := uImag2 - vTwImag2 + vOutImag3 := uImag3 - vTwImag3 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -116,35 +116,35 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag2 := coeffs[j+t+6] vImag3 := coeffs[j+t+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag - vwReal2 := vReal2*wReal - vImag2*wImag - vwReal3 := vReal3*wReal - vImag3*wImag + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag + vTwReal2 := vReal2*wReal - vImag2*wImag + vTwReal3 := vReal3*wReal - vImag3*wImag - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal - vwImag2 := vReal2*wImag + vImag2*wReal - vwImag3 := vReal3*wImag + vImag3*wReal + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal + vTwImag2 := vReal2*wImag + vImag2*wReal + vTwImag3 := vReal3*wImag + vImag3*wReal - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 - uOutReal2 := uReal2 + vwReal2 - uOutReal3 := uReal3 + vwReal3 + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 + uOutReal2 := uReal2 + vTwReal2 + uOutReal3 := uReal3 + vTwReal3 - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 - uOutImag2 := uImag2 + vwImag2 - uOutImag3 := uImag3 + vwImag3 + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 + uOutImag2 := uImag2 + vTwImag2 + uOutImag3 := uImag3 + vTwImag3 - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 - vOutReal2 := uReal2 - vwReal2 - vOutReal3 := uReal3 - vwReal3 + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 + vOutReal2 := uReal2 - vTwReal2 + vOutReal3 := uReal3 - vTwReal3 - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 - vOutImag2 := uImag2 - vwImag2 - vOutImag3 := uImag3 - vwImag3 + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 + vOutImag2 := uImag2 - vTwImag2 + vOutImag3 := uImag3 - vTwImag3 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -186,23 +186,23 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag0 := coeffs[j+6] vImag1 := coeffs[j+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -236,23 +236,23 @@ func fftInPlace(coeffs []float64, tw []complex128) { uImag1 := coeffs[j+6] vImag1 := coeffs[j+7] - vwReal0 := vReal0*wReal0 - vImag0*wImag0 - vwImag0 := vReal0*wImag0 + vImag0*wReal0 + vTwReal0 := vReal0*wReal0 - vImag0*wImag0 + vTwImag0 := vReal0*wImag0 + vImag0*wReal0 - vwReal1 := vReal1*wReal1 - vImag1*wImag1 - vwImag1 := vReal1*wImag1 + vImag1*wReal1 + vTwReal1 := vReal1*wReal1 - vImag1*wImag1 + vTwImag1 := vReal1*wImag1 + vImag1*wReal1 - uOutReal0 := uReal0 + vwReal0 - vOutReal0 := uReal0 - vwReal0 + uOutReal0 := uReal0 + vTwReal0 + vOutReal0 := uReal0 - vTwReal0 - uOutReal1 := uReal1 + vwReal1 - vOutReal1 := uReal1 - vwReal1 + uOutReal1 := uReal1 + vTwReal1 + vOutReal1 := uReal1 - vTwReal1 - uOutImag0 := uImag0 + vwImag0 - vOutImag0 := uImag0 - vwImag0 + uOutImag0 := uImag0 + vTwImag0 + vOutImag0 := uImag0 - vTwImag0 - uOutImag1 := uImag1 + vwImag1 - vOutImag1 := uImag1 - vwImag1 + uOutImag1 := uImag1 + vTwImag1 + vOutImag1 := uImag1 - vTwImag1 coeffs[j+0] = uOutReal0 coeffs[j+1] = vOutReal0 @@ -305,23 +305,23 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag0 := uImag0 - vImag0 vOutImag1 := uImag1 - vImag1 - vwOutReal0 := vOutReal0*wReal0 - vOutImag0*wImag0 - vwOutReal1 := vOutReal1*wReal1 - vOutImag1*wImag1 + vTwOutReal0 := vOutReal0*wReal0 - vOutImag0*wImag0 + vTwOutReal1 := vOutReal1*wReal1 - vOutImag1*wImag1 - vwOutImag0 := vOutReal0*wImag0 + vOutImag0*wReal0 - vwOutImag1 := vOutReal1*wImag1 + vOutImag1*wReal1 + vTwOutImag0 := vOutReal0*wImag0 + vOutImag0*wReal0 + vTwOutImag1 := vOutReal1*wImag1 + vOutImag1*wReal1 coeffs[j+0] = uOutReal0 - coeffs[j+1] = vwOutReal0 + coeffs[j+1] = vTwOutReal0 coeffs[j+2] = uOutReal1 - coeffs[j+3] = vwOutReal1 + coeffs[j+3] = vTwOutReal1 coeffs[j+4] = uOutImag0 - coeffs[j+5] = vwOutImag0 + coeffs[j+5] = vTwOutImag0 coeffs[j+6] = uOutImag1 - coeffs[j+7] = vwOutImag1 + coeffs[j+7] = vTwOutImag1 } for j := 0; j < N; j += 8 { @@ -353,23 +353,23 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag0 := uImag0 - vImag0 vOutImag1 := uImag1 - vImag1 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 - coeffs[j+2] = vwOutReal0 - coeffs[j+3] = vwOutReal1 + coeffs[j+2] = vTwOutReal0 + coeffs[j+3] = vTwOutReal1 coeffs[j+4] = uOutImag0 coeffs[j+5] = uOutImag1 - coeffs[j+6] = vwOutImag0 - coeffs[j+7] = vwOutImag1 + coeffs[j+6] = vTwOutImag0 + coeffs[j+7] = vTwOutImag1 } t := 8 @@ -423,15 +423,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag2 := uImag2 - vImag2 vOutImag3 := uImag3 - vImag3 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutReal2 := vOutReal2*wReal - vOutImag2*wImag - vwOutReal3 := vOutReal3*wReal - vOutImag3*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal2 := vOutReal2*wReal - vOutImag2*wImag + vTwOutReal3 := vOutReal3*wReal - vOutImag3*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal - vwOutImag2 := vOutReal2*wImag + vOutImag2*wReal - vwOutImag3 := vOutReal3*wImag + vOutImag3*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag2 := vOutReal2*wImag + vOutImag2*wReal + vTwOutImag3 := vOutReal3*wImag + vOutImag3*wReal coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -443,15 +443,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { coeffs[j+6] = uOutImag2 coeffs[j+7] = uOutImag3 - coeffs[j+t+0] = vwOutReal0 - coeffs[j+t+1] = vwOutReal1 - coeffs[j+t+2] = vwOutReal2 - coeffs[j+t+3] = vwOutReal3 + coeffs[j+t+0] = vTwOutReal0 + coeffs[j+t+1] = vTwOutReal1 + coeffs[j+t+2] = vTwOutReal2 + coeffs[j+t+3] = vTwOutReal3 - coeffs[j+t+4] = vwOutImag0 - coeffs[j+t+5] = vwOutImag1 - coeffs[j+t+6] = vwOutImag2 - coeffs[j+t+7] = vwOutImag3 + coeffs[j+t+4] = vTwOutImag0 + coeffs[j+t+5] = vTwOutImag1 + coeffs[j+t+6] = vTwOutImag2 + coeffs[j+t+7] = vTwOutImag3 } } t <<= 1 @@ -502,15 +502,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag2 := uImag2 - vImag2 vOutImag3 := uImag3 - vImag3 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutReal2 := vOutReal2*wReal - vOutImag2*wImag - vwOutReal3 := vOutReal3*wReal - vOutImag3*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal2 := vOutReal2*wReal - vOutImag2*wImag + vTwOutReal3 := vOutReal3*wReal - vOutImag3*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal - vwOutImag2 := vOutReal2*wImag + vOutImag2*wReal - vwOutImag3 := vOutReal3*wImag + vOutImag3*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag2 := vOutReal2*wImag + vOutImag2*wReal + vTwOutImag3 := vOutReal3*wImag + vOutImag3*wReal coeffs[j+0] = uOutReal0 / scale coeffs[j+1] = uOutReal1 / scale @@ -522,14 +522,14 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { coeffs[j+6] = uOutImag2 / scale coeffs[j+7] = uOutImag3 / scale - coeffs[j+N/2+0] = vwOutReal0 / scale - coeffs[j+N/2+1] = vwOutReal1 / scale - coeffs[j+N/2+2] = vwOutReal2 / scale - coeffs[j+N/2+3] = vwOutReal3 / scale + coeffs[j+N/2+0] = vTwOutReal0 / scale + coeffs[j+N/2+1] = vTwOutReal1 / scale + coeffs[j+N/2+2] = vTwOutReal2 / scale + coeffs[j+N/2+3] = vTwOutReal3 / scale - coeffs[j+N/2+4] = vwOutImag0 / scale - coeffs[j+N/2+5] = vwOutImag1 / scale - coeffs[j+N/2+6] = vwOutImag2 / scale - coeffs[j+N/2+7] = vwOutImag3 / scale + coeffs[j+N/2+4] = vTwOutImag0 / scale + coeffs[j+N/2+5] = vTwOutImag1 / scale + coeffs[j+N/2+6] = vTwOutImag2 / scale + coeffs[j+N/2+7] = vTwOutImag3 / scale } } diff --git a/math/poly/asm_fft_amd64.go b/math/poly/asm_fft_amd64.go index 011c287..a6844de 100644 --- a/math/poly/asm_fft_amd64.go +++ b/math/poly/asm_fft_amd64.go @@ -41,35 +41,35 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag2 := coeffs[j+N/2+6] vImag3 := coeffs[j+N/2+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag - vwReal2 := vReal2*wReal - vImag2*wImag - vwReal3 := vReal3*wReal - vImag3*wImag - - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal - vwImag2 := vReal2*wImag + vImag2*wReal - vwImag3 := vReal3*wImag + vImag3*wReal - - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 - uOutReal2 := uReal2 + vwReal2 - uOutReal3 := uReal3 + vwReal3 - - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 - uOutImag2 := uImag2 + vwImag2 - uOutImag3 := uImag3 + vwImag3 - - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 - vOutReal2 := uReal2 - vwReal2 - vOutReal3 := uReal3 - vwReal3 - - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 - vOutImag2 := uImag2 - vwImag2 - vOutImag3 := uImag3 - vwImag3 + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag + vTwReal2 := vReal2*wReal - vImag2*wImag + vTwReal3 := vReal3*wReal - vImag3*wImag + + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal + vTwImag2 := vReal2*wImag + vImag2*wReal + vTwImag3 := vReal3*wImag + vImag3*wReal + + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 + uOutReal2 := uReal2 + vTwReal2 + uOutReal3 := uReal3 + vTwReal3 + + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 + uOutImag2 := uImag2 + vTwImag2 + uOutImag3 := uImag3 + vTwImag3 + + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 + vOutReal2 := uReal2 - vTwReal2 + vOutReal3 := uReal3 - vTwReal3 + + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 + vOutImag2 := uImag2 - vTwImag2 + vOutImag3 := uImag3 - vTwImag3 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -125,35 +125,35 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag2 := coeffs[j+t+6] vImag3 := coeffs[j+t+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag - vwReal2 := vReal2*wReal - vImag2*wImag - vwReal3 := vReal3*wReal - vImag3*wImag + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag + vTwReal2 := vReal2*wReal - vImag2*wImag + vTwReal3 := vReal3*wReal - vImag3*wImag - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal - vwImag2 := vReal2*wImag + vImag2*wReal - vwImag3 := vReal3*wImag + vImag3*wReal + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal + vTwImag2 := vReal2*wImag + vImag2*wReal + vTwImag3 := vReal3*wImag + vImag3*wReal - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 - uOutReal2 := uReal2 + vwReal2 - uOutReal3 := uReal3 + vwReal3 + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 + uOutReal2 := uReal2 + vTwReal2 + uOutReal3 := uReal3 + vTwReal3 - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 - uOutImag2 := uImag2 + vwImag2 - uOutImag3 := uImag3 + vwImag3 + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 + uOutImag2 := uImag2 + vTwImag2 + uOutImag3 := uImag3 + vTwImag3 - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 - vOutReal2 := uReal2 - vwReal2 - vOutReal3 := uReal3 - vwReal3 + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 + vOutReal2 := uReal2 - vTwReal2 + vOutReal3 := uReal3 - vTwReal3 - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 - vOutImag2 := uImag2 - vwImag2 - vOutImag3 := uImag3 - vwImag3 + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 + vOutImag2 := uImag2 - vTwImag2 + vOutImag3 := uImag3 - vTwImag3 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -195,23 +195,23 @@ func fftInPlace(coeffs []float64, tw []complex128) { vImag0 := coeffs[j+6] vImag1 := coeffs[j+7] - vwReal0 := vReal0*wReal - vImag0*wImag - vwReal1 := vReal1*wReal - vImag1*wImag + vTwReal0 := vReal0*wReal - vImag0*wImag + vTwReal1 := vReal1*wReal - vImag1*wImag - vwImag0 := vReal0*wImag + vImag0*wReal - vwImag1 := vReal1*wImag + vImag1*wReal + vTwImag0 := vReal0*wImag + vImag0*wReal + vTwImag1 := vReal1*wImag + vImag1*wReal - uOutReal0 := uReal0 + vwReal0 - uOutReal1 := uReal1 + vwReal1 + uOutReal0 := uReal0 + vTwReal0 + uOutReal1 := uReal1 + vTwReal1 - vOutReal0 := uReal0 - vwReal0 - vOutReal1 := uReal1 - vwReal1 + vOutReal0 := uReal0 - vTwReal0 + vOutReal1 := uReal1 - vTwReal1 - uOutImag0 := uImag0 + vwImag0 - uOutImag1 := uImag1 + vwImag1 + uOutImag0 := uImag0 + vTwImag0 + uOutImag1 := uImag1 + vTwImag1 - vOutImag0 := uImag0 - vwImag0 - vOutImag1 := uImag1 - vwImag1 + vOutImag0 := uImag0 - vTwImag0 + vOutImag1 := uImag1 - vTwImag1 coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -245,23 +245,23 @@ func fftInPlace(coeffs []float64, tw []complex128) { uImag1 := coeffs[j+6] vImag1 := coeffs[j+7] - vwReal0 := vReal0*wReal0 - vImag0*wImag0 - vwImag0 := vReal0*wImag0 + vImag0*wReal0 + vTwReal0 := vReal0*wReal0 - vImag0*wImag0 + vTwImag0 := vReal0*wImag0 + vImag0*wReal0 - vwReal1 := vReal1*wReal1 - vImag1*wImag1 - vwImag1 := vReal1*wImag1 + vImag1*wReal1 + vTwReal1 := vReal1*wReal1 - vImag1*wImag1 + vTwImag1 := vReal1*wImag1 + vImag1*wReal1 - uOutReal0 := uReal0 + vwReal0 - vOutReal0 := uReal0 - vwReal0 + uOutReal0 := uReal0 + vTwReal0 + vOutReal0 := uReal0 - vTwReal0 - uOutReal1 := uReal1 + vwReal1 - vOutReal1 := uReal1 - vwReal1 + uOutReal1 := uReal1 + vTwReal1 + vOutReal1 := uReal1 - vTwReal1 - uOutImag0 := uImag0 + vwImag0 - vOutImag0 := uImag0 - vwImag0 + uOutImag0 := uImag0 + vTwImag0 + vOutImag0 := uImag0 - vTwImag0 - uOutImag1 := uImag1 + vwImag1 - vOutImag1 := uImag1 - vwImag1 + uOutImag1 := uImag1 + vTwImag1 + vOutImag1 := uImag1 - vTwImag1 coeffs[j+0] = uOutReal0 coeffs[j+1] = vOutReal0 @@ -319,23 +319,23 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag0 := uImag0 - vImag0 vOutImag1 := uImag1 - vImag1 - vwOutReal0 := vOutReal0*wReal0 - vOutImag0*wImag0 - vwOutReal1 := vOutReal1*wReal1 - vOutImag1*wImag1 + vTwOutReal0 := vOutReal0*wReal0 - vOutImag0*wImag0 + vTwOutReal1 := vOutReal1*wReal1 - vOutImag1*wImag1 - vwOutImag0 := vOutReal0*wImag0 + vOutImag0*wReal0 - vwOutImag1 := vOutReal1*wImag1 + vOutImag1*wReal1 + vTwOutImag0 := vOutReal0*wImag0 + vOutImag0*wReal0 + vTwOutImag1 := vOutReal1*wImag1 + vOutImag1*wReal1 coeffs[j+0] = uOutReal0 - coeffs[j+1] = vwOutReal0 + coeffs[j+1] = vTwOutReal0 coeffs[j+2] = uOutReal1 - coeffs[j+3] = vwOutReal1 + coeffs[j+3] = vTwOutReal1 coeffs[j+4] = uOutImag0 - coeffs[j+5] = vwOutImag0 + coeffs[j+5] = vTwOutImag0 coeffs[j+6] = uOutImag1 - coeffs[j+7] = vwOutImag1 + coeffs[j+7] = vTwOutImag1 } for j := 0; j < N; j += 8 { @@ -367,23 +367,23 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag0 := uImag0 - vImag0 vOutImag1 := uImag1 - vImag1 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 - coeffs[j+2] = vwOutReal0 - coeffs[j+3] = vwOutReal1 + coeffs[j+2] = vTwOutReal0 + coeffs[j+3] = vTwOutReal1 coeffs[j+4] = uOutImag0 coeffs[j+5] = uOutImag1 - coeffs[j+6] = vwOutImag0 - coeffs[j+7] = vwOutImag1 + coeffs[j+6] = vTwOutImag0 + coeffs[j+7] = vTwOutImag1 } t := 8 @@ -437,15 +437,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag2 := uImag2 - vImag2 vOutImag3 := uImag3 - vImag3 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutReal2 := vOutReal2*wReal - vOutImag2*wImag - vwOutReal3 := vOutReal3*wReal - vOutImag3*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal2 := vOutReal2*wReal - vOutImag2*wImag + vTwOutReal3 := vOutReal3*wReal - vOutImag3*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal - vwOutImag2 := vOutReal2*wImag + vOutImag2*wReal - vwOutImag3 := vOutReal3*wImag + vOutImag3*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag2 := vOutReal2*wImag + vOutImag2*wReal + vTwOutImag3 := vOutReal3*wImag + vOutImag3*wReal coeffs[j+0] = uOutReal0 coeffs[j+1] = uOutReal1 @@ -457,15 +457,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { coeffs[j+6] = uOutImag2 coeffs[j+7] = uOutImag3 - coeffs[j+t+0] = vwOutReal0 - coeffs[j+t+1] = vwOutReal1 - coeffs[j+t+2] = vwOutReal2 - coeffs[j+t+3] = vwOutReal3 + coeffs[j+t+0] = vTwOutReal0 + coeffs[j+t+1] = vTwOutReal1 + coeffs[j+t+2] = vTwOutReal2 + coeffs[j+t+3] = vTwOutReal3 - coeffs[j+t+4] = vwOutImag0 - coeffs[j+t+5] = vwOutImag1 - coeffs[j+t+6] = vwOutImag2 - coeffs[j+t+7] = vwOutImag3 + coeffs[j+t+4] = vTwOutImag0 + coeffs[j+t+5] = vTwOutImag1 + coeffs[j+t+6] = vTwOutImag2 + coeffs[j+t+7] = vTwOutImag3 } } t <<= 1 @@ -516,15 +516,15 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { vOutImag2 := uImag2 - vImag2 vOutImag3 := uImag3 - vImag3 - vwOutReal0 := vOutReal0*wReal - vOutImag0*wImag - vwOutReal1 := vOutReal1*wReal - vOutImag1*wImag - vwOutReal2 := vOutReal2*wReal - vOutImag2*wImag - vwOutReal3 := vOutReal3*wReal - vOutImag3*wImag + vTwOutReal0 := vOutReal0*wReal - vOutImag0*wImag + vTwOutReal1 := vOutReal1*wReal - vOutImag1*wImag + vTwOutReal2 := vOutReal2*wReal - vOutImag2*wImag + vTwOutReal3 := vOutReal3*wReal - vOutImag3*wImag - vwOutImag0 := vOutReal0*wImag + vOutImag0*wReal - vwOutImag1 := vOutReal1*wImag + vOutImag1*wReal - vwOutImag2 := vOutReal2*wImag + vOutImag2*wReal - vwOutImag3 := vOutReal3*wImag + vOutImag3*wReal + vTwOutImag0 := vOutReal0*wImag + vOutImag0*wReal + vTwOutImag1 := vOutReal1*wImag + vOutImag1*wReal + vTwOutImag2 := vOutReal2*wImag + vOutImag2*wReal + vTwOutImag3 := vOutReal3*wImag + vOutImag3*wReal coeffs[j+0] = uOutReal0 / scale coeffs[j+1] = uOutReal1 / scale @@ -536,14 +536,14 @@ func ifftInPlace(coeffs []float64, twInv []complex128) { coeffs[j+6] = uOutImag2 / scale coeffs[j+7] = uOutImag3 / scale - coeffs[j+N/2+0] = vwOutReal0 / scale - coeffs[j+N/2+1] = vwOutReal1 / scale - coeffs[j+N/2+2] = vwOutReal2 / scale - coeffs[j+N/2+3] = vwOutReal3 / scale + coeffs[j+N/2+0] = vTwOutReal0 / scale + coeffs[j+N/2+1] = vTwOutReal1 / scale + coeffs[j+N/2+2] = vTwOutReal2 / scale + coeffs[j+N/2+3] = vTwOutReal3 / scale - coeffs[j+N/2+4] = vwOutImag0 / scale - coeffs[j+N/2+5] = vwOutImag1 / scale - coeffs[j+N/2+6] = vwOutImag2 / scale - coeffs[j+N/2+7] = vwOutImag3 / scale + coeffs[j+N/2+4] = vTwOutImag0 / scale + coeffs[j+N/2+5] = vTwOutImag1 / scale + coeffs[j+N/2+6] = vTwOutImag2 / scale + coeffs[j+N/2+7] = vTwOutImag3 / scale } }