diff --git a/_gen/gen.go b/_gen/gen.go index 941e8a90..eb8d0d0b 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -266,6 +266,10 @@ func VPXOR3way(a, b, dst reg.VecVirtual) { } func genMulAvx2(name string, inputs int, outputs int, xor bool) { + if outputs < 4 { + // Covered by 64-byte version. + return + } const perLoopBits = 5 const perLoop = 1 << perLoopBits diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 237c9ddd..f7273259 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -15,12 +15,6 @@ func sSE2XorSlice_64(in []byte, out []byte) //go:noescape func avx2XorSlice_64(in []byte, out []byte) -// mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -49,22 +43,11 @@ func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -93,22 +76,11 @@ func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -137,11 +109,6 @@ func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape @@ -378,12 +345,6 @@ func mulAvx2GFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -412,22 +373,11 @@ func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -456,22 +406,11 @@ func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -500,11 +439,6 @@ func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape @@ -741,12 +675,6 @@ func mulAvx2GFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -775,22 +703,11 @@ func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -819,22 +736,11 @@ func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -863,11 +769,6 @@ func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape @@ -1104,12 +1005,6 @@ func mulAvx2GFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -1138,22 +1033,11 @@ func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1182,22 +1066,11 @@ func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1226,11 +1099,6 @@ func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape @@ -1467,12 +1335,6 @@ func mulAvx2GFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -1501,22 +1363,11 @@ func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1545,22 +1396,11 @@ func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1589,11 +1429,6 @@ func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape @@ -1830,12 +1665,6 @@ func mulAvx2GFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -1864,22 +1693,11 @@ func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1908,22 +1726,11 @@ func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1952,11 +1759,6 @@ func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape @@ -2193,12 +1995,6 @@ func mulAvx2GFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -2227,22 +2023,11 @@ func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2271,22 +2056,11 @@ func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -2315,11 +2089,6 @@ func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape @@ -2556,12 +2325,6 @@ func mulAvx2GFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -2590,22 +2353,11 @@ func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2634,22 +2386,11 @@ func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -2678,11 +2419,6 @@ func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape @@ -2919,12 +2655,6 @@ func mulAvx2GFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -2953,22 +2683,11 @@ func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2997,22 +2716,11 @@ func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -3041,11 +2749,6 @@ func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape @@ -3282,12 +2985,6 @@ func mulAvx2GFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, //go:noescape func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -3316,22 +3013,11 @@ func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. -// -//go:noescape -func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -3360,22 +3046,11 @@ func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. -// -//go:noescape -func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -3404,11 +3079,6 @@ func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvx2GFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. -// -//go:noescape -func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. // //go:noescape diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 4e414408..d065f197 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -110,57 +110,6 @@ end: VZEROUPPER RET -// func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX - - // Add start offset to output - ADDQ BX, DX - - // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_1x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y2 - ADDQ $0x20, CX - VPSRLQ $0x04, Y2, Y4 - VPAND Y3, Y2, Y2 - VPAND Y3, Y4, Y4 - VPSHUFB Y2, Y0, Y2 - VPSHUFB Y4, Y1, Y4 - VPXOR Y2, Y4, Y2 - - // Store 1 outputs - VMOVDQU Y2, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x1_loop - VZEROUPPER - -mulAvxTwo_1x1_end: - RET - // func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1_64(SB), $0-88 @@ -401,58 +350,6 @@ mulAvx2GFNI_1x1Xor_loop: mulAvx2GFNI_1x1Xor_end: RET -// func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX - - // Add start offset to output - ADDQ BX, DX - - // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_1x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y4 - ADDQ $0x20, CX - VPSRLQ $0x04, Y4, Y5 - VPAND Y3, Y4, Y4 - VPAND Y3, Y5, Y5 - VMOVDQU (DX), Y2 - VPSHUFB Y4, Y0, Y4 - VPSHUFB Y5, Y1, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 1 outputs - VMOVDQU Y2, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x1Xor_loop - VZEROUPPER - -mulAvxTwo_1x1Xor_end: - RET - // func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 @@ -517,66 +414,6 @@ mulAvxTwo_1x1_64Xor_loop: mulAvxTwo_1x1_64Xor_end: RET -// func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - ADDQ SI, DX - - // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_1x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y6, Y8, Y8 - VPAND Y6, Y9, Y9 - VPSHUFB Y8, Y0, Y5 - VPSHUFB Y9, Y1, Y7 - VPXOR Y5, Y7, Y4 - VPSHUFB Y8, Y2, Y5 - VPSHUFB Y9, Y3, Y7 - VPXOR Y5, Y7, Y5 - - // Store 2 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - VMOVDQU Y5, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x2_loop - VZEROUPPER - -mulAvxTwo_1x2_end: - RET - // func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64(SB), $0-88 @@ -858,68 +695,6 @@ mulAvx2GFNI_1x2Xor_loop: mulAvx2GFNI_1x2Xor_end: RET -// func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - ADDQ SI, DX - - // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_1x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y9 - ADDQ $0x20, CX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (BX), Y4 - VPSHUFB Y9, Y0, Y7 - VPSHUFB Y10, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - VMOVDQU (DX), Y5 - VPSHUFB Y9, Y2, Y7 - VPSHUFB Y10, Y3, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) - - // Store 2 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - VMOVDQU Y5, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x2Xor_loop - VZEROUPPER - -mulAvxTwo_1x2Xor_end: - RET - // func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 @@ -999,75 +774,6 @@ mulAvxTwo_1x2_64Xor_loop: mulAvxTwo_1x2_64Xor_end: RET -// func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, BX - ADDQ DI, SI - ADDQ DI, DX - - // Add start offset to input - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_1x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y11 - ADDQ $0x20, CX - VPSRLQ $0x04, Y11, Y12 - VPAND Y9, Y11, Y11 - VPAND Y9, Y12, Y12 - VPSHUFB Y11, Y0, Y8 - VPSHUFB Y12, Y1, Y10 - VPXOR Y8, Y10, Y6 - VPSHUFB Y11, Y2, Y8 - VPSHUFB Y12, Y3, Y10 - VPXOR Y8, Y10, Y7 - VPSHUFB Y11, Y4, Y8 - VPSHUFB Y12, Y5, Y10 - VPXOR Y8, Y10, Y8 - - // Store 3 outputs - VMOVDQU Y6, (BX) - ADDQ $0x20, BX - VMOVDQU Y7, (SI) - ADDQ $0x20, SI - VMOVDQU Y8, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x3_loop - VZEROUPPER - -mulAvxTwo_1x3_end: - RET - // func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64(SB), $0-88 @@ -1390,78 +1096,6 @@ mulAvx2GFNI_1x3Xor_loop: mulAvx2GFNI_1x3Xor_end: RET -// func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, BX - ADDQ DI, SI - ADDQ DI, DX - - // Add start offset to input - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_1x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (BX), Y6 - VPSHUFB Y12, Y0, Y10 - VPSHUFB Y13, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) - VMOVDQU (SI), Y7 - VPSHUFB Y12, Y2, Y10 - VPSHUFB Y13, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) - VMOVDQU (DX), Y8 - VPSHUFB Y12, Y4, Y10 - VPSHUFB Y13, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Store 3 outputs - VMOVDQU Y6, (BX) - ADDQ $0x20, BX - VMOVDQU Y7, (SI) - ADDQ $0x20, SI - VMOVDQU Y8, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x3Xor_loop - VZEROUPPER - -mulAvxTwo_1x3Xor_end: - RET - // func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 @@ -5511,71 +5145,6 @@ mulAvxTwo_1x10Xor_loop: mulAvxTwo_1x10Xor_end: RET -// func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - - // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_2x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y4 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) - - // Store 1 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x1_loop - VZEROUPPER - -mulAvxTwo_2x1_end: - RET - // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 @@ -5873,72 +5442,6 @@ mulAvx2GFNI_2x1Xor_loop: mulAvx2GFNI_2x1Xor_end: RET -// func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - - // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_2x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VMOVDQU (BX), Y4 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) - - // Store 1 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x1Xor_loop - VZEROUPPER - -mulAvxTwo_2x1Xor_end: - RET - // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 @@ -6024,85 +5527,6 @@ mulAvxTwo_2x1_64Xor_loop: mulAvxTwo_2x1_64Xor_end: RET -// func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - ADDQ DI, BX - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_2x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y8 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y9 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - - // Store 2 outputs - VMOVDQU Y8, (SI) - ADDQ $0x20, SI - VMOVDQU Y9, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x2_loop - VZEROUPPER - -mulAvxTwo_2x2_end: - RET - // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 @@ -6461,87 +5885,6 @@ mulAvx2GFNI_2x2Xor_loop: mulAvx2GFNI_2x2Xor_end: RET -// func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - ADDQ DI, BX - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_2x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (SI), Y8 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VMOVDQU (BX), Y9 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - - // Store 2 outputs - VMOVDQU Y8, (SI) - ADDQ $0x20, SI - VMOVDQU Y9, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x2Xor_loop - VZEROUPPER - -mulAvxTwo_2x2Xor_end: - RET - // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 @@ -6650,99 +5993,6 @@ mulAvxTwo_2x2_64Xor_loop: mulAvxTwo_2x2_64Xor_end: RET -// func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_2x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x3_loop - VZEROUPPER - -mulAvxTwo_2x3_end: - RET - // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 @@ -7162,102 +6412,6 @@ mulAvx2GFNI_2x3Xor_loop: mulAvx2GFNI_2x3Xor_end: RET -// func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_2x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (DI), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (SI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x3Xor_loop - VZEROUPPER - -mulAvxTwo_2x3Xor_end: - RET - // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 @@ -12716,85 +11870,6 @@ mulAvxTwo_2x10Xor_loop: mulAvxTwo_2x10Xor_end: RET -// func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_3x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y6 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Store 1 outputs - VMOVDQU Y6, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x1_loop - VZEROUPPER - -mulAvxTwo_3x1_end: - RET - // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 @@ -13149,86 +12224,6 @@ mulAvx2GFNI_3x1Xor_loop: mulAvx2GFNI_3x1Xor_end: RET -// func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_3x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VMOVDQU (SI), Y6 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Store 1 outputs - VMOVDQU Y6, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x1Xor_loop - VZEROUPPER - -mulAvxTwo_3x1Xor_end: - RET - // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 @@ -13335,104 +12330,6 @@ mulAvxTwo_3x1_64Xor_loop: mulAvxTwo_3x1_64Xor_end: RET -// func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_3x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x2_loop - VZEROUPPER - -mulAvxTwo_3x2_end: - RET - // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 @@ -13868,106 +12765,6 @@ mulAvx2GFNI_3x2Xor_loop: mulAvx2GFNI_3x2Xor_end: RET -// func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_3x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (DI), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x2Xor_loop - VZEROUPPER - -mulAvxTwo_3x2Xor_end: - RET - // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 @@ -14105,123 +12902,6 @@ mulAvxTwo_3x2_64Xor_loop: mulAvxTwo_3x2_64Xor_end: RET -// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_3x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x3_loop - VZEROUPPER - -mulAvxTwo_3x3_end: - RET - // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 @@ -14738,126 +13418,6 @@ mulAvx2GFNI_3x3Xor_loop: mulAvx2GFNI_3x3Xor_end: RET -// func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_3x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (DI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x3Xor_loop - VZEROUPPER - -mulAvxTwo_3x3Xor_end: - RET - // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 @@ -21709,99 +20269,6 @@ mulAvxTwo_3x10Xor_loop: mulAvxTwo_3x10Xor_end: RET -// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 - - // Add start offset to output - ADDQ R8, DI - - // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_4x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y8 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Store 1 outputs - VMOVDQU Y8, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x1_loop - VZEROUPPER - -mulAvxTwo_4x1_end: - RET - // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 @@ -22213,100 +20680,6 @@ mulAvx2GFNI_4x1Xor_loop: mulAvx2GFNI_4x1Xor_end: RET -// func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 - - // Add start offset to output - ADDQ R8, DI - - // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_4x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VMOVDQU (DI), Y8 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Store 1 outputs - VMOVDQU Y8, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x1Xor_loop - VZEROUPPER - -mulAvxTwo_4x1Xor_end: - RET - // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 @@ -22434,123 +20807,6 @@ mulAvxTwo_4x1_64Xor_loop: mulAvxTwo_4x1_64Xor_end: RET -// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_4x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x2_loop - VZEROUPPER - -mulAvxTwo_4x2_end: - RET - // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 @@ -23063,125 +21319,6 @@ mulAvx2GFNI_4x2Xor_loop: mulAvx2GFNI_4x2Xor_end: RET -// func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_4x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x2Xor_loop - VZEROUPPER - -mulAvxTwo_4x2Xor_end: - RET - // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 @@ -23348,147 +21485,6 @@ mulAvxTwo_4x2_64Xor_loop: mulAvxTwo_4x2_64Xor_end: RET -// func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, R8 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_4x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - VMOVDQU Y2, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x3_loop - VZEROUPPER - -mulAvxTwo_4x3_end: - RET - // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 @@ -24102,150 +22098,6 @@ mulAvx2GFNI_4x3Xor_loop: mulAvx2GFNI_4x3Xor_end: RET -// func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, R8 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_4x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R8), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - VMOVDQU Y2, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x3Xor_loop - VZEROUPPER - -mulAvxTwo_4x3Xor_end: - RET - // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 @@ -32346,113 +30198,6 @@ mulAvxTwo_4x10Xor_loop: mulAvxTwo_4x10Xor_end: RET -// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - - // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 - -mulAvxTwo_5x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y10 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Store 1 outputs - VMOVDQU Y10, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x1_loop - VZEROUPPER - -mulAvxTwo_5x1_end: - RET - // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 @@ -32921,114 +30666,6 @@ mulAvx2GFNI_5x1Xor_loop: mulAvx2GFNI_5x1Xor_end: RET -// func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - - // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 - -mulAvxTwo_5x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VMOVDQU (R8), Y10 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Store 1 outputs - VMOVDQU Y10, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x1Xor_loop - VZEROUPPER - -mulAvxTwo_5x1Xor_end: - RET - // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 @@ -33177,142 +30814,6 @@ mulAvxTwo_5x1_64Xor_loop: mulAvxTwo_5x1_64Xor_end: RET -// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_5x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x2_loop - VZEROUPPER - -mulAvxTwo_5x2_end: - RET - // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 @@ -33902,144 +31403,6 @@ mulAvx2GFNI_5x2Xor_loop: mulAvx2GFNI_5x2Xor_end: RET -// func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_5x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x2Xor_loop - VZEROUPPER - -mulAvxTwo_5x2Xor_end: - RET - // func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 @@ -34235,171 +31598,6 @@ mulAvxTwo_5x2_64Xor_loop: mulAvxTwo_5x2_64Xor_end: RET -// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R10 - ADDQ R12, R11 - ADDQ R12, R9 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_5x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - VMOVDQU Y2, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x3_loop - VZEROUPPER - -mulAvxTwo_5x3_end: - RET - // func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 @@ -35110,174 +32308,6 @@ mulAvx2GFNI_5x3Xor_loop: mulAvx2GFNI_5x3Xor_end: RET -// func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R10 - ADDQ R12, R11 - ADDQ R12, R9 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_5x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R9), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - VMOVDQU Y2, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x3Xor_loop - VZEROUPPER - -mulAvxTwo_5x3Xor_end: - RET - // func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 @@ -44620,127 +41650,6 @@ mulAvxTwo_5x10Xor_loop: mulAvxTwo_5x10Xor_end: RET -// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - - // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 - -mulAvxTwo_6x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - VPXOR Y14, Y15, Y12 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Store 1 outputs - VMOVDQU Y12, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x1_loop - VZEROUPPER - -mulAvxTwo_6x1_end: - RET - // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 @@ -45266,128 +42175,6 @@ mulAvx2GFNI_6x1Xor_loop: mulAvx2GFNI_6x1Xor_end: RET -// func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - - // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 - -mulAvxTwo_6x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VMOVDQU (R9), Y12 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Store 1 outputs - VMOVDQU Y12, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x1Xor_loop - VZEROUPPER - -mulAvxTwo_6x1Xor_end: - RET - // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 @@ -45557,161 +42344,6 @@ mulAvxTwo_6x1_64Xor_loop: mulAvxTwo_6x1_64Xor_end: RET -// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_6x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x2_loop - VZEROUPPER - -mulAvxTwo_6x2_end: - RET - // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 @@ -46378,163 +43010,6 @@ mulAvx2GFNI_6x2Xor_loop: mulAvx2GFNI_6x2Xor_end: RET -// func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_6x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x2Xor_loop - VZEROUPPER - -mulAvxTwo_6x2Xor_end: - RET - // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 @@ -46759,195 +43234,6 @@ mulAvxTwo_6x2_64Xor_loop: mulAvxTwo_6x2_64Xor_end: RET -// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R10 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_6x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x3_loop - VZEROUPPER - -mulAvxTwo_6x3_end: - RET - // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 @@ -47755,198 +44041,6 @@ mulAvx2GFNI_6x3Xor_loop: mulAvx2GFNI_6x3Xor_end: RET -// func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R10 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_6x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R10), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x3Xor_loop - VZEROUPPER - -mulAvxTwo_6x3Xor_end: - RET - // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 @@ -58530,141 +54624,6 @@ mulAvxTwo_6x10Xor_loop: mulAvxTwo_6x10Xor_end: RET -// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 18 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x1_loop - VZEROUPPER - -mulAvxTwo_7x1_end: - RET - // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 @@ -59247,142 +55206,6 @@ mulAvx2GFNI_7x1Xor_loop: mulAvx2GFNI_7x1Xor_end: RET -// func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 18 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x1Xor_loop - VZEROUPPER - -mulAvxTwo_7x1Xor_end: - RET - // func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 @@ -59573,180 +55396,6 @@ mulAvxTwo_7x1_64Xor_loop: mulAvxTwo_7x1_64Xor_end: RET -// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_7x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2_loop - VZEROUPPER - -mulAvxTwo_7x2_end: - RET - // func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 @@ -60490,182 +56139,6 @@ mulAvx2GFNI_7x2Xor_loop: mulAvx2GFNI_7x2Xor_end: RET -// func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_7x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2Xor_loop - VZEROUPPER - -mulAvxTwo_7x2Xor_end: - RET - // func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 @@ -60919,219 +56392,6 @@ mulAvxTwo_7x2_64Xor_loop: mulAvxTwo_7x2_64Xor_end: RET -// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R11 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_7x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x3_loop - VZEROUPPER - -mulAvxTwo_7x3_end: - RET - // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 @@ -62036,222 +57296,6 @@ mulAvx2GFNI_7x3Xor_loop: mulAvx2GFNI_7x3Xor_end: RET -// func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R11 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_7x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R11), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x3Xor_loop - VZEROUPPER - -mulAvxTwo_7x3Xor_end: - RET - // func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 @@ -74081,155 +69125,6 @@ mulAvxTwo_7x10Xor_loop: mulAvxTwo_7x10Xor_end: RET -// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_8x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x1_loop - VZEROUPPER - -mulAvxTwo_8x1_end: - RET - // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 @@ -74869,156 +69764,6 @@ mulAvx2GFNI_8x1Xor_loop: mulAvx2GFNI_8x1Xor_end: RET -// func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_8x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x1Xor_loop - VZEROUPPER - -mulAvxTwo_8x1Xor_end: - RET - // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 @@ -75230,199 +69975,6 @@ mulAvxTwo_8x1_64Xor_loop: mulAvxTwo_8x1_64Xor_end: RET -// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_8x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x2_loop - VZEROUPPER - -mulAvxTwo_8x2_end: - RET - // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 @@ -76243,201 +70795,6 @@ mulAvx2GFNI_8x2Xor_loop: mulAvx2GFNI_8x2Xor_end: RET -// func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_8x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x2Xor_loop - VZEROUPPER - -mulAvxTwo_8x2Xor_end: - RET - // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 @@ -76720,243 +71077,6 @@ mulAvxTwo_8x2_64Xor_loop: mulAvxTwo_8x2_64Xor_end: RET -// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_8x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x3_loop - VZEROUPPER - -mulAvxTwo_8x3_end: - RET - // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 @@ -77958,246 +72078,6 @@ mulAvx2GFNI_8x3Xor_loop: mulAvx2GFNI_8x3Xor_end: RET -// func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_8x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R12), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x3Xor_loop - VZEROUPPER - -mulAvxTwo_8x3Xor_end: - RET - // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 @@ -91278,169 +85158,6 @@ mulAvxTwo_8x10Xor_loop: mulAvxTwo_8x10Xor_end: RET -// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_9x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x1_loop - VZEROUPPER - -mulAvxTwo_9x1_end: - RET - // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 @@ -92137,170 +85854,6 @@ mulAvx2GFNI_9x1Xor_loop: mulAvx2GFNI_9x1Xor_end: RET -// func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_9x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x1Xor_loop - VZEROUPPER - -mulAvxTwo_9x1Xor_end: - RET - // func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 @@ -92533,218 +86086,6 @@ mulAvxTwo_9x1_64Xor_loop: mulAvxTwo_9x1_64Xor_end: RET -// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - ADDQ R15, R13 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_9x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x2_loop - VZEROUPPER - -mulAvxTwo_9x2_end: - RET - // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 @@ -93642,220 +86983,6 @@ mulAvx2GFNI_9x2Xor_loop: mulAvx2GFNI_9x2Xor_end: RET -// func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - ADDQ R15, R13 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_9x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x2Xor_loop - VZEROUPPER - -mulAvxTwo_9x2Xor_end: - RET - // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 @@ -94167,267 +87294,6 @@ mulAvxTwo_9x2_64Xor_loop: mulAvxTwo_9x2_64Xor_end: RET -// func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_9x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x3_loop - VZEROUPPER - -mulAvxTwo_9x3_end: - RET - // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 @@ -95526,270 +88392,6 @@ mulAvx2GFNI_9x3Xor_loop: mulAvx2GFNI_9x3Xor_end: RET -// func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_9x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x3Xor_loop - VZEROUPPER - -mulAvxTwo_9x3Xor_end: - RET - // func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 @@ -110130,183 +102732,6 @@ mulAvxTwo_9x10Xor_loop: mulAvxTwo_9x10Xor_end: RET -// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_10x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x1_loop - VZEROUPPER - -mulAvxTwo_10x1_end: - RET - // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 @@ -111060,184 +103485,6 @@ mulAvx2GFNI_10x1Xor_loop: mulAvx2GFNI_10x1Xor_end: RET -// func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_10x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x1Xor_loop - VZEROUPPER - -mulAvxTwo_10x1Xor_end: - RET - // func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 @@ -111491,237 +103738,6 @@ mulAvxTwo_10x1_64Xor_loop: mulAvxTwo_10x1_64Xor_end: RET -// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R15 - ADDQ BP, R14 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_10x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R15) - ADDQ $0x20, R15 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x2_loop - VZEROUPPER - -mulAvxTwo_10x2_end: - RET - // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 @@ -112696,239 +104712,6 @@ mulAvx2GFNI_10x2Xor_loop: mulAvx2GFNI_10x2Xor_end: RET -// func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R15 - ADDQ BP, R14 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_10x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R15), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R15) - ADDQ $0x20, R15 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x2Xor_loop - VZEROUPPER - -mulAvxTwo_10x2Xor_end: - RET - // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 @@ -113269,293 +105052,6 @@ mulAvxTwo_10x2_64Xor_loop: mulAvxTwo_10x2_64Xor_end: RET -// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_10x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_10x3_loop - VZEROUPPER - -mulAvxTwo_10x3_end: - RET - // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 @@ -114765,296 +106261,6 @@ mulAvx2GFNI_10x3Xor_loop: mulAvx2GFNI_10x3Xor_end: RET -// func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3Xor_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_10x3Xor_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_10x3Xor_loop - VZEROUPPER - -mulAvxTwo_10x3Xor_end: - RET - // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88